From dad1581944139bb104965340804d1fb4518aab2c Mon Sep 17 00:00:00 2001
From: Mikko Rapeli
Date: Thu, 15 Oct 2015 07:55:59 +0200
Subject: netfilter: ebtables: use __u64 from linux/types.h
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fixes userspace compilation error:

linux/netfilter_bridge/ebtables.h:38:2: error: unknown type name ‘uint64_t’

Signed-off-by: Mikko Rapeli
Signed-off-by: Pablo Neira Ayuso
---
 include/uapi/linux/netfilter_bridge/ebtables.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/netfilter_bridge/ebtables.h b/include/uapi/linux/netfilter_bridge/ebtables.h
index fd2ee501726d..e3cdf9f1a259 100644
--- a/include/uapi/linux/netfilter_bridge/ebtables.h
+++ b/include/uapi/linux/netfilter_bridge/ebtables.h
@@ -12,6 +12,8 @@
 #ifndef _UAPI__LINUX_BRIDGE_EFF_H
 #define _UAPI__LINUX_BRIDGE_EFF_H
+#include <linux/types.h>
+#include <linux/if.h>
 #include <linux/netfilter_bridge.h>
 
 #define EBT_TABLE_MAXNAMELEN 32
@@ -33,8 +35,8 @@ struct xt_match;
 struct xt_target;
 
 struct ebt_counter {
-	uint64_t pcnt;
-	uint64_t bcnt;
+	__u64 pcnt;
+	__u64 bcnt;
 };
 
 struct ebt_replace {
-- 
cgit v1.2.3
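The error above is reproducible outside the kernel tree: uint64_t only exists after including <stdint.h>, which exported UAPI headers must not assume, while __u64 comes from <linux/types.h> itself. A minimal sketch of the kind of standalone compile test this fixes (the file name and test body are illustrative, not part of the patch):

	/* ebtables-compile-test.c: builds with just
	 *   gcc -c ebtables-compile-test.c
	 * against installed UAPI headers; before the patch it failed
	 * with "unknown type name 'uint64_t'" unless <stdint.h>
	 * happened to be included first.
	 */
	#include <linux/netfilter_bridge/ebtables.h>

	int main(void)
	{
		struct ebt_counter c = { .pcnt = 0, .bcnt = 0 };

		return (int)c.bcnt;
	}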
From 1ffad83dffd675cd742286ae82dca7d746cb0da8 Mon Sep 17 00:00:00 2001
From: Mikko Rapeli
Date: Thu, 15 Oct 2015 07:56:30 +0200
Subject: netfilter: fix include files for compilation

Add missing header dependencies and other small changes so that each
file compiles alone in userspace.

Signed-off-by: Mikko Rapeli
Signed-off-by: Pablo Neira Ayuso
---
 include/uapi/linux/netfilter/ipset/ip_set_bitmap.h       |  2 ++
 include/uapi/linux/netfilter/ipset/ip_set_hash.h         |  2 ++
 include/uapi/linux/netfilter/ipset/ip_set_list.h         |  2 ++
 include/uapi/linux/netfilter/nf_conntrack_tuple_common.h |  3 +++
 include/uapi/linux/netfilter/xt_HMARK.h                  |  1 +
 include/uapi/linux/netfilter/xt_RATEEST.h                |  1 +
 include/uapi/linux/netfilter/xt_TEE.h                    |  2 ++
 include/uapi/linux/netfilter/xt_TPROXY.h                 |  1 +
 include/uapi/linux/netfilter/xt_hashlimit.h              |  1 +
 include/uapi/linux/netfilter/xt_ipvs.h                   |  1 +
 include/uapi/linux/netfilter/xt_mac.h                    |  2 ++
 include/uapi/linux/netfilter/xt_osf.h                    |  2 ++
 include/uapi/linux/netfilter/xt_physdev.h                |  2 +-
 include/uapi/linux/netfilter/xt_policy.h                 |  2 ++
 include/uapi/linux/netfilter/xt_rateest.h                |  1 +
 include/uapi/linux/netfilter/xt_recent.h                 |  1 +
 include/uapi/linux/netfilter/xt_sctp.h                   | 12 ++++++------
 include/uapi/linux/netfilter_arp/arp_tables.h            |  1 +
 include/uapi/linux/netfilter_bridge.h                    |  1 +
 include/uapi/linux/netfilter_bridge/ebt_arp.h            |  1 +
 include/uapi/linux/netfilter_bridge/ebt_arpreply.h       |  2 ++
 include/uapi/linux/netfilter_bridge/ebt_ip6.h            |  1 +
 include/uapi/linux/netfilter_bridge/ebt_nat.h            |  2 ++
 include/uapi/linux/netfilter_ipv4/ip_tables.h            |  1 +
 include/uapi/linux/netfilter_ipv6/ip6_tables.h           |  1 +
 include/uapi/linux/netfilter_ipv6/ip6t_rt.h              |  2 +-
 26 files changed, 42 insertions(+), 8 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/netfilter/ipset/ip_set_bitmap.h b/include/uapi/linux/netfilter/ipset/ip_set_bitmap.h
index 6a2c038d1888..fd5024d26269 100644
--- a/include/uapi/linux/netfilter/ipset/ip_set_bitmap.h
+++ b/include/uapi/linux/netfilter/ipset/ip_set_bitmap.h
@@ -1,6 +1,8 @@
 #ifndef _UAPI__IP_SET_BITMAP_H
 #define _UAPI__IP_SET_BITMAP_H
 
+#include <linux/netfilter/ipset/ip_set.h>
+
 /* Bitmap type specific error codes */
 enum {
 	/* The element is out of the range of the set */
diff --git a/include/uapi/linux/netfilter/ipset/ip_set_hash.h b/include/uapi/linux/netfilter/ipset/ip_set_hash.h
index 352eeccdc7f2..82deeb883ac4 100644
--- a/include/uapi/linux/netfilter/ipset/ip_set_hash.h
+++ b/include/uapi/linux/netfilter/ipset/ip_set_hash.h
@@ -1,6 +1,8 @@
 #ifndef _UAPI__IP_SET_HASH_H
 #define _UAPI__IP_SET_HASH_H
 
+#include <linux/netfilter/ipset/ip_set.h>
+
 /* Hash type specific error codes */
 enum {
 	/* Hash is full */
diff --git a/include/uapi/linux/netfilter/ipset/ip_set_list.h b/include/uapi/linux/netfilter/ipset/ip_set_list.h
index a44efaa98213..84d430368266 100644
--- a/include/uapi/linux/netfilter/ipset/ip_set_list.h
+++ b/include/uapi/linux/netfilter/ipset/ip_set_list.h
@@ -1,6 +1,8 @@
 #ifndef _UAPI__IP_SET_LIST_H
 #define _UAPI__IP_SET_LIST_H
 
+#include <linux/netfilter/ipset/ip_set.h>
+
 /* List type specific error codes */
 enum {
 	/* Set name to be added/deleted/tested does not exist. */
diff --git a/include/uapi/linux/netfilter/nf_conntrack_tuple_common.h b/include/uapi/linux/netfilter/nf_conntrack_tuple_common.h
index 2f6bbc5b8125..a9c3834abdd4 100644
--- a/include/uapi/linux/netfilter/nf_conntrack_tuple_common.h
+++ b/include/uapi/linux/netfilter/nf_conntrack_tuple_common.h
@@ -1,6 +1,9 @@
 #ifndef _NF_CONNTRACK_TUPLE_COMMON_H
 #define _NF_CONNTRACK_TUPLE_COMMON_H
 
+#include <linux/types.h>
+#include <linux/netfilter.h>
+
 enum ip_conntrack_dir {
 	IP_CT_DIR_ORIGINAL,
 	IP_CT_DIR_REPLY,
diff --git a/include/uapi/linux/netfilter/xt_HMARK.h b/include/uapi/linux/netfilter/xt_HMARK.h
index 826fc5807577..3fb48c8d8d78 100644
--- a/include/uapi/linux/netfilter/xt_HMARK.h
+++ b/include/uapi/linux/netfilter/xt_HMARK.h
@@ -2,6 +2,7 @@
 #define XT_HMARK_H_
 
 #include <linux/types.h>
+#include <linux/if.h>
 
 enum {
 	XT_HMARK_SADDR_MASK,
diff --git a/include/uapi/linux/netfilter/xt_RATEEST.h b/include/uapi/linux/netfilter/xt_RATEEST.h
index 6605e20ad8cf..ec1b57047e03 100644
--- a/include/uapi/linux/netfilter/xt_RATEEST.h
+++ b/include/uapi/linux/netfilter/xt_RATEEST.h
@@ -2,6 +2,7 @@
 #define _XT_RATEEST_TARGET_H
 
 #include <linux/types.h>
+#include <linux/if.h>
 
 struct xt_rateest_target_info {
 	char			name[IFNAMSIZ];
diff --git a/include/uapi/linux/netfilter/xt_TEE.h b/include/uapi/linux/netfilter/xt_TEE.h
index 5c21d5c829af..01092023404b 100644
--- a/include/uapi/linux/netfilter/xt_TEE.h
+++ b/include/uapi/linux/netfilter/xt_TEE.h
@@ -1,6 +1,8 @@
 #ifndef _XT_TEE_TARGET_H
 #define _XT_TEE_TARGET_H
 
+#include <linux/netfilter.h>
+
 struct xt_tee_tginfo {
 	union nf_inet_addr gw;
 	char oif[16];
diff --git a/include/uapi/linux/netfilter/xt_TPROXY.h b/include/uapi/linux/netfilter/xt_TPROXY.h
index 902043c2073f..8d693eefdc1f 100644
--- a/include/uapi/linux/netfilter/xt_TPROXY.h
+++ b/include/uapi/linux/netfilter/xt_TPROXY.h
@@ -2,6 +2,7 @@
 #define _XT_TPROXY_H
 
 #include <linux/types.h>
+#include <linux/netfilter.h>
 
 /* TPROXY target is capable of marking the packet to perform
  * redirection. We can get rid of that whenever we get support for
diff --git a/include/uapi/linux/netfilter/xt_hashlimit.h b/include/uapi/linux/netfilter/xt_hashlimit.h
index cbfc43d1af68..6db90372f09c 100644
--- a/include/uapi/linux/netfilter/xt_hashlimit.h
+++ b/include/uapi/linux/netfilter/xt_hashlimit.h
@@ -2,6 +2,7 @@
 #define _UAPI_XT_HASHLIMIT_H
 
 #include <linux/types.h>
+#include <linux/if.h>
 
 /* timings are in milliseconds. */
 #define XT_HASHLIMIT_SCALE 10000
diff --git a/include/uapi/linux/netfilter/xt_ipvs.h b/include/uapi/linux/netfilter/xt_ipvs.h
index eff34ac18808..e03b9c31a39d 100644
--- a/include/uapi/linux/netfilter/xt_ipvs.h
+++ b/include/uapi/linux/netfilter/xt_ipvs.h
@@ -2,6 +2,7 @@
 #define _XT_IPVS_H
 
 #include <linux/types.h>
+#include <linux/netfilter.h>
 
 enum {
 	XT_IPVS_IPVS_PROPERTY =	1 << 0, /* all other options imply this one */
diff --git a/include/uapi/linux/netfilter/xt_mac.h b/include/uapi/linux/netfilter/xt_mac.h
index b892cdc67e06..9a19a08a9181 100644
--- a/include/uapi/linux/netfilter/xt_mac.h
+++ b/include/uapi/linux/netfilter/xt_mac.h
@@ -1,6 +1,8 @@
 #ifndef _XT_MAC_H
 #define _XT_MAC_H
 
+#include <linux/if_ether.h>
+
 struct xt_mac_info {
 	unsigned char srcaddr[ETH_ALEN];
 	int invert;
diff --git a/include/uapi/linux/netfilter/xt_osf.h b/include/uapi/linux/netfilter/xt_osf.h
index 5d66caeba3ee..e6159958b2fb 100644
--- a/include/uapi/linux/netfilter/xt_osf.h
+++ b/include/uapi/linux/netfilter/xt_osf.h
@@ -20,6 +20,8 @@
 #define _XT_OSF_H
 
 #include <linux/types.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
 
 #define MAXGENRELEN		32
 
diff --git a/include/uapi/linux/netfilter/xt_physdev.h b/include/uapi/linux/netfilter/xt_physdev.h
index db7a2982e9c0..ccdde87da214 100644
--- a/include/uapi/linux/netfilter/xt_physdev.h
+++ b/include/uapi/linux/netfilter/xt_physdev.h
@@ -2,7 +2,7 @@
 #define _UAPI_XT_PHYSDEV_H
 
 #include <linux/types.h>
-
+#include <linux/if.h>
 
 #define XT_PHYSDEV_OP_IN	0x01
 #define XT_PHYSDEV_OP_OUT	0x02
diff --git a/include/uapi/linux/netfilter/xt_policy.h b/include/uapi/linux/netfilter/xt_policy.h
index be8ead05c316..d8a9800dce61 100644
--- a/include/uapi/linux/netfilter/xt_policy.h
+++ b/include/uapi/linux/netfilter/xt_policy.h
@@ -2,6 +2,8 @@
 #define _XT_POLICY_H
 
 #include <linux/types.h>
+#include <linux/in.h>
+#include <linux/in6.h>
 
 #define XT_POLICY_MAX_ELEM	4
 
diff --git a/include/uapi/linux/netfilter/xt_rateest.h b/include/uapi/linux/netfilter/xt_rateest.h
index d40a6196842a..13fe50d4e4b3 100644
--- a/include/uapi/linux/netfilter/xt_rateest.h
+++ b/include/uapi/linux/netfilter/xt_rateest.h
@@ -2,6 +2,7 @@
 #define _XT_RATEEST_MATCH_H
 
 #include <linux/types.h>
+#include <linux/if.h>
 
 enum xt_rateest_match_flags {
 	XT_RATEEST_MATCH_INVERT	= 1<<0,
diff --git a/include/uapi/linux/netfilter/xt_recent.h b/include/uapi/linux/netfilter/xt_recent.h
index 6ef36c113e89..955d562031cc 100644
--- a/include/uapi/linux/netfilter/xt_recent.h
+++ b/include/uapi/linux/netfilter/xt_recent.h
@@ -2,6 +2,7 @@
 #define _LINUX_NETFILTER_XT_RECENT_H 1
 
 #include <linux/types.h>
+#include <linux/netfilter.h>
 
 enum {
 	XT_RECENT_CHECK    = 1 << 0,
diff --git a/include/uapi/linux/netfilter/xt_sctp.h b/include/uapi/linux/netfilter/xt_sctp.h
index 29287be696a2..58ffcfb7978e 100644
--- a/include/uapi/linux/netfilter/xt_sctp.h
+++ b/include/uapi/linux/netfilter/xt_sctp.h
@@ -66,26 +66,26 @@ struct xt_sctp_info {
 #define SCTP_CHUNKMAP_IS_CLEAR(chunkmap) \
 	__sctp_chunkmap_is_clear((chunkmap), ARRAY_SIZE(chunkmap))
-static inline bool
+static inline _Bool
 __sctp_chunkmap_is_clear(const __u32 *chunkmap, unsigned int n)
 {
 	unsigned int i;
 	for (i = 0; i < n; ++i)
 		if (chunkmap[i])
-			return false;
-	return true;
+			return 0;
+	return 1;
 }
 
 #define SCTP_CHUNKMAP_IS_ALL_SET(chunkmap) \
 	__sctp_chunkmap_is_all_set((chunkmap), ARRAY_SIZE(chunkmap))
-static inline bool
+static inline _Bool
 __sctp_chunkmap_is_all_set(const __u32 *chunkmap, unsigned int n)
 {
 	unsigned int i;
 	for (i = 0; i < n; ++i)
 		if (chunkmap[i] != ~0U)
-			return false;
-	return true;
+			return 0;
+	return 1;
 }
 
 #endif /* _XT_SCTP_H_ */
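The xt_sctp.h hunk above is the one non-include change in the series: the static inline helpers returned the kernel type bool, which plain C in userspace only has after <stdbool.h>. The C99 keyword _Bool (with 0/1 in place of false/true) needs no include at all, so the header now parses standalone. An illustrative standalone use, not part of the patch:

	#include <linux/netfilter/xt_sctp.h>

	int main(void)
	{
		__u32 map[8] = { 0 };

		/* The converted helper returns _Bool; no <stdbool.h> needed. */
		return __sctp_chunkmap_is_clear(map, 8) ? 0 : 1;
	}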
diff --git a/include/uapi/linux/netfilter_arp/arp_tables.h b/include/uapi/linux/netfilter_arp/arp_tables.h
index a5a86a4db6b3..ece3ad4eecda 100644
--- a/include/uapi/linux/netfilter_arp/arp_tables.h
+++ b/include/uapi/linux/netfilter_arp/arp_tables.h
@@ -11,6 +11,7 @@
 
 #include <linux/types.h>
 #include <linux/compiler.h>
+#include <linux/if.h>
 #include <linux/netfilter_arp.h>
 
 #include <linux/netfilter/x_tables.h>
diff --git a/include/uapi/linux/netfilter_bridge.h b/include/uapi/linux/netfilter_bridge.h
index a5eda6db8d79..514519b47651 100644
--- a/include/uapi/linux/netfilter_bridge.h
+++ b/include/uapi/linux/netfilter_bridge.h
@@ -4,6 +4,7 @@
 
 /* bridge-specific defines for netfilter. */
 
+#include <linux/in.h>
 #include <linux/netfilter.h>
 #include <linux/if_ether.h>
 #include <linux/if_vlan.h>
diff --git a/include/uapi/linux/netfilter_bridge/ebt_arp.h b/include/uapi/linux/netfilter_bridge/ebt_arp.h
index 522f3e427f49..dd4df25330e8 100644
--- a/include/uapi/linux/netfilter_bridge/ebt_arp.h
+++ b/include/uapi/linux/netfilter_bridge/ebt_arp.h
@@ -2,6 +2,7 @@
 #define __LINUX_BRIDGE_EBT_ARP_H
 
 #include <linux/types.h>
+#include <linux/if_ether.h>
 
 #define EBT_ARP_OPCODE 0x01
 #define EBT_ARP_HTYPE 0x02
diff --git a/include/uapi/linux/netfilter_bridge/ebt_arpreply.h b/include/uapi/linux/netfilter_bridge/ebt_arpreply.h
index 7e77896e1fbf..6fee3402e307 100644
--- a/include/uapi/linux/netfilter_bridge/ebt_arpreply.h
+++ b/include/uapi/linux/netfilter_bridge/ebt_arpreply.h
@@ -1,6 +1,8 @@
 #ifndef __LINUX_BRIDGE_EBT_ARPREPLY_H
 #define __LINUX_BRIDGE_EBT_ARPREPLY_H
 
+#include <linux/if_ether.h>
+
 struct ebt_arpreply_info {
 	unsigned char mac[ETH_ALEN];
 	int target;
diff --git a/include/uapi/linux/netfilter_bridge/ebt_ip6.h b/include/uapi/linux/netfilter_bridge/ebt_ip6.h
index 42b889682721..a062f0ce95f9 100644
--- a/include/uapi/linux/netfilter_bridge/ebt_ip6.h
+++ b/include/uapi/linux/netfilter_bridge/ebt_ip6.h
@@ -13,6 +13,7 @@
 #define __LINUX_BRIDGE_EBT_IP6_H
 
 #include <linux/types.h>
+#include <linux/in6.h>
 
 #define EBT_IP6_SOURCE 0x01
 #define EBT_IP6_DEST 0x02
diff --git a/include/uapi/linux/netfilter_bridge/ebt_nat.h b/include/uapi/linux/netfilter_bridge/ebt_nat.h
index 5e74e3b03bd6..c990d74ee966 100644
--- a/include/uapi/linux/netfilter_bridge/ebt_nat.h
+++ b/include/uapi/linux/netfilter_bridge/ebt_nat.h
@@ -1,6 +1,8 @@
 #ifndef __LINUX_BRIDGE_EBT_NAT_H
 #define __LINUX_BRIDGE_EBT_NAT_H
 
+#include <linux/if_ether.h>
+
 #define NAT_ARP_BIT  (0x00000010)
 struct ebt_nat_info {
 	unsigned char mac[ETH_ALEN];
diff --git a/include/uapi/linux/netfilter_ipv4/ip_tables.h b/include/uapi/linux/netfilter_ipv4/ip_tables.h
index f1e6ef256034..d0da53d96d93 100644
--- a/include/uapi/linux/netfilter_ipv4/ip_tables.h
+++ b/include/uapi/linux/netfilter_ipv4/ip_tables.h
@@ -17,6 +17,7 @@
 
 #include <linux/types.h>
 #include <linux/compiler.h>
+#include <linux/if.h>
 #include <linux/netfilter_ipv4.h>
 
 #include <linux/netfilter/x_tables.h>
diff --git a/include/uapi/linux/netfilter_ipv6/ip6_tables.h b/include/uapi/linux/netfilter_ipv6/ip6_tables.h
index 649c68062dca..d1b22653daf2 100644
--- a/include/uapi/linux/netfilter_ipv6/ip6_tables.h
+++ b/include/uapi/linux/netfilter_ipv6/ip6_tables.h
@@ -17,6 +17,7 @@
 
 #include <linux/types.h>
 #include <linux/compiler.h>
+#include <linux/if.h>
 #include <linux/netfilter_ipv6.h>
 
 #include <linux/netfilter/x_tables.h>
diff --git a/include/uapi/linux/netfilter_ipv6/ip6t_rt.h b/include/uapi/linux/netfilter_ipv6/ip6t_rt.h
index 7605a5ff81cd..558f81e46fb9 100644
--- a/include/uapi/linux/netfilter_ipv6/ip6t_rt.h
+++ b/include/uapi/linux/netfilter_ipv6/ip6t_rt.h
@@ -2,7 +2,7 @@
 #define _IP6T_RT_H
 
 #include <linux/types.h>
-/*#include <linux/in6.h>*/
+#include <linux/in6.h>
 
 #define IP6T_RT_HOPS 16
-- 
cgit v1.2.3
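The property the series establishes is mechanical and therefore checkable: after make headers_install, each touched header should compile as the sole include of an otherwise empty translation unit. A rough spot check under that assumption (strictly, each header belongs in its own translation unit; several are combined here only for brevity):

	#include <linux/netfilter/xt_TEE.h>
	#include <linux/netfilter/xt_physdev.h>
	#include <linux/netfilter_bridge/ebt_nat.h>
	#include <linux/netfilter_ipv6/ip6t_rt.h>

	int main(void)
	{
		/* Touch types from the fixed headers so they must be complete. */
		struct xt_tee_tginfo tee = { .oif = "eth0" };
		struct ebt_nat_info nat = { .target = 0 };

		return sizeof(tee) + sizeof(nat) > 0 ? 0 : 1;
	}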
From f7ccdb96fa31305d480678b1ba81225907dd81ef Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner
Date: Wed, 11 Nov 2015 20:17:37 -0200
Subject: netfilter: nf_ct_sctp: move ip_ct_sctp away from UAPI

ip_ct_sctp is an internal structure, embedded by the union
nf_conntrack_proto to store sctp-specific information at conntrack
entries. It has no business with UAPI.

This patch moves it from UAPI to a saner place, together with similar
structs for other protocols.

Signed-off-by: Marcelo Ricardo Leitner
Acked-by: Neil Horman
Signed-off-by: Pablo Neira Ayuso
---
 include/linux/netfilter/nf_conntrack_sctp.h      | 13 +++++++++++++
 include/uapi/linux/netfilter/nf_conntrack_sctp.h | 12 +++---------
 2 files changed, 16 insertions(+), 9 deletions(-)
 create mode 100644 include/linux/netfilter/nf_conntrack_sctp.h

(limited to 'include/uapi/linux')

diff --git a/include/linux/netfilter/nf_conntrack_sctp.h b/include/linux/netfilter/nf_conntrack_sctp.h
new file mode 100644
index 000000000000..22a16a23cd8a
--- /dev/null
+++ b/include/linux/netfilter/nf_conntrack_sctp.h
@@ -0,0 +1,13 @@
+#ifndef _NF_CONNTRACK_SCTP_H
+#define _NF_CONNTRACK_SCTP_H
+/* SCTP tracking. */
+
+#include <uapi/linux/netfilter/nf_conntrack_sctp.h>
+
+struct ip_ct_sctp {
+	enum sctp_conntrack state;
+
+	__be32 vtag[IP_CT_DIR_MAX];
+};
+
+#endif /* _NF_CONNTRACK_SCTP_H */
diff --git a/include/uapi/linux/netfilter/nf_conntrack_sctp.h b/include/uapi/linux/netfilter/nf_conntrack_sctp.h
index ed4e776e1242..2cbc366c3fb4 100644
--- a/include/uapi/linux/netfilter/nf_conntrack_sctp.h
+++ b/include/uapi/linux/netfilter/nf_conntrack_sctp.h
@@ -1,5 +1,5 @@
-#ifndef _NF_CONNTRACK_SCTP_H
-#define _NF_CONNTRACK_SCTP_H
+#ifndef _UAPI_NF_CONNTRACK_SCTP_H
+#define _UAPI_NF_CONNTRACK_SCTP_H
 /* SCTP tracking. */
 
 #include <linux/netfilter/nf_conntrack_tuple_common.h>
@@ -18,10 +18,4 @@ enum sctp_conntrack {
 	SCTP_CONNTRACK_MAX
 };
 
-struct ip_ct_sctp {
-	enum sctp_conntrack state;
-
-	__be32 vtag[IP_CT_DIR_MAX];
-};
-
-#endif /* _NF_CONNTRACK_SCTP_H */
+#endif /* _UAPI_NF_CONNTRACK_SCTP_H */
-- 
cgit v1.2.3
From 7ef8f65df976369588fa1b6466668b1b6a26eb3c Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov
Date: Sat, 21 Nov 2015 15:57:27 +0100
Subject: net: ipmr: fix code and comment style

Trivial code and comment style fixes, also removed some extra newlines,
spaces and tabs.

Signed-off-by: Nikolay Aleksandrov
Signed-off-by: David S. Miller
---
 include/uapi/linux/mroute.h |  59 ++++++------------
 net/ipv4/ipmr.c             | 142 ++++++++++++--------------------------------
 2 files changed, 54 insertions(+), 147 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/mroute.h b/include/uapi/linux/mroute.h
index a382d2c04a42..cf943016930f 100644
--- a/include/uapi/linux/mroute.h
+++ b/include/uapi/linux/mroute.h
@@ -4,15 +4,13 @@
 #include <linux/sockios.h>
 #include <linux/types.h>
 
-/*
- *	Based on the MROUTING 3.5 defines primarily to keep
- *	source compatibility with BSD.
+/* Based on the MROUTING 3.5 defines primarily to keep
+ * source compatibility with BSD.
  *
- *	See the mrouted code for the original history.
- *
- *	Protocol Independent Multicast (PIM) data structures included
- *	Carlos Picoto (cap@di.fc.ul.pt)
+ * See the mrouted code for the original history.
* + * Protocol Independent Multicast (PIM) data structures included + * Carlos Picoto (cap@di.fc.ul.pt) */ #define MRT_BASE 200 @@ -34,15 +32,13 @@ #define SIOCGETSGCNT (SIOCPROTOPRIVATE+1) #define SIOCGETRPF (SIOCPROTOPRIVATE+2) -#define MAXVIFS 32 +#define MAXVIFS 32 typedef unsigned long vifbitmap_t; /* User mode code depends on this lot */ typedef unsigned short vifi_t; #define ALL_VIFS ((vifi_t)(-1)) -/* - * Same idea as select - */ - +/* Same idea as select */ + #define VIFM_SET(n,m) ((m)|=(1<<(n))) #define VIFM_CLR(n,m) ((m)&=~(1<<(n))) #define VIFM_ISSET(n,m) ((m)&(1<<(n))) @@ -50,11 +46,9 @@ typedef unsigned short vifi_t; #define VIFM_COPY(mfrom,mto) ((mto)=(mfrom)) #define VIFM_SAME(m1,m2) ((m1)==(m2)) -/* - * Passed by mrouted for an MRT_ADD_VIF - again we use the - * mrouted 3.6 structures for compatibility +/* Passed by mrouted for an MRT_ADD_VIF - again we use the + * mrouted 3.6 structures for compatibility */ - struct vifctl { vifi_t vifc_vifi; /* Index of VIF */ unsigned char vifc_flags; /* VIFF_ flags */ @@ -73,10 +67,7 @@ struct vifctl { #define VIFF_USE_IFINDEX 0x8 /* use vifc_lcl_ifindex instead of vifc_lcl_addr to find an interface */ -/* - * Cache manipulation structures for mrouted and PIMd - */ - +/* Cache manipulation structures for mrouted and PIMd */ struct mfcctl { struct in_addr mfcc_origin; /* Origin of mcast */ struct in_addr mfcc_mcastgrp; /* Group in question */ @@ -88,10 +79,7 @@ struct mfcctl { int mfcc_expire; }; -/* - * Group count retrieval for mrouted - */ - +/* Group count retrieval for mrouted */ struct sioc_sg_req { struct in_addr src; struct in_addr grp; @@ -100,10 +88,7 @@ struct sioc_sg_req { unsigned long wrong_if; }; -/* - * To get vif packet counts - */ - +/* To get vif packet counts */ struct sioc_vif_req { vifi_t vifi; /* Which iface */ unsigned long icount; /* In packets */ @@ -112,11 +97,9 @@ struct sioc_vif_req { unsigned long obytes; /* Out bytes */ }; -/* - * This is the format the mroute daemon expects to see IGMP control - * data. Magically happens to be like an IP packet as per the original +/* This is the format the mroute daemon expects to see IGMP control + * data. Magically happens to be like an IP packet as per the original */ - struct igmpmsg { __u32 unused1,unused2; unsigned char im_msgtype; /* What is this */ @@ -126,21 +109,13 @@ struct igmpmsg { struct in_addr im_src,im_dst; }; -/* - * That's all usermode folks - */ - - +/* That's all usermode folks */ #define MFC_ASSERT_THRESH (3*HZ) /* Maximal freq. 
of asserts */ -/* - * Pseudo messages used by mrouted - */ - +/* Pseudo messages used by mrouted */ #define IGMPMSG_NOCACHE 1 /* Kern cache fill request to mrouted */ #define IGMPMSG_WRONGVIF 2 /* For PIM assert processing (unused) */ #define IGMPMSG_WHOLEPKT 3 /* For PIM Register processing */ - #endif /* _UAPI__LINUX_MROUTE_H */ diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index e153ab7b17a1..286ede3716ee 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -102,9 +102,7 @@ static inline bool pimsm_enabled(void) static DEFINE_RWLOCK(mrt_lock); -/* - * Multicast router control variables - */ +/* Multicast router control variables */ #define VIF_EXISTS(_mrt, _idx) ((_mrt)->vif_table[_idx].dev != NULL) @@ -393,8 +391,7 @@ static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v) } } -static -struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v) +static struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v) { struct net_device *dev; @@ -561,8 +558,7 @@ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb, struct iphdr *encap; encap = (struct iphdr *)(skb_transport_header(skb) + pimlen); - /* - * Check that: + /* Check that: * a. packet is really sent to a multicast group * b. packet is not a NULL-REGISTER * c. packet is not truncated @@ -603,7 +599,6 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt) * vif_delete - Delete a VIF entry * @notify: Set to 1, if the caller is a notifier_call */ - static int vif_delete(struct mr_table *mrt, int vifi, int notify, struct list_head *head) { @@ -673,7 +668,6 @@ static inline void ipmr_cache_free(struct mfc_cache *c) /* Destroy an unresolved cache entry, killing queued skbs * and reporting error to netlink readers. */ - static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c) { struct net *net = read_pnet(&mrt->net); @@ -701,9 +695,7 @@ static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c) ipmr_cache_free(c); } - /* Timer process for the unresolved queue. */ - static void ipmr_expire_process(unsigned long arg) { struct mr_table *mrt = (struct mr_table *)arg; @@ -743,7 +735,6 @@ out: } /* Fill oifs list. It is called under write locked mrt_lock. 
*/ - static void ipmr_update_thresholds(struct mr_table *mrt, struct mfc_cache *cache, unsigned char *ttls) { @@ -808,7 +799,6 @@ static int vif_add(struct net *net, struct mr_table *mrt, return err; } break; - case VIFF_USE_IFINDEX: case 0: if (vifc->vifc_flags == VIFF_USE_IFINDEX) { @@ -928,9 +918,7 @@ skip: return ipmr_cache_find_any_parent(mrt, vifi); } -/* - * Allocate a multicast cache entry - */ +/* Allocate a multicast cache entry */ static struct mfc_cache *ipmr_cache_alloc(void) { struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL); @@ -951,10 +939,7 @@ static struct mfc_cache *ipmr_cache_alloc_unres(void) return c; } -/* - * A cache entry has gone into a resolved state from queued - */ - +/* A cache entry has gone into a resolved state from queued */ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt, struct mfc_cache *uc, struct mfc_cache *c) { @@ -962,7 +947,6 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt, struct nlmsgerr *e; /* Play the pending entries through our router */ - while ((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) { if (ip_hdr(skb)->version == 0) { struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr)); @@ -1064,12 +1048,9 @@ static int ipmr_cache_report(struct mr_table *mrt, return ret; } -/* - * Queue a packet for resolution. It gets locked cache entry! - */ - -static int -ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb) +/* Queue a packet for resolution. It gets locked cache entry! */ +static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, + struct sk_buff *skb) { bool found = false; int err; @@ -1087,7 +1068,6 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb) if (!found) { /* Create a new entry if allowable */ - if (atomic_read(&mrt->cache_resolve_queue_len) >= 10 || (c = ipmr_cache_alloc_unres()) == NULL) { spin_unlock_bh(&mfc_unres_lock); @@ -1097,13 +1077,11 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb) } /* Fill in the new cache entry */ - c->mfc_parent = -1; c->mfc_origin = iph->saddr; c->mfc_mcastgrp = iph->daddr; /* Reflect first query at mrouted. */ - err = ipmr_cache_report(mrt, skb, vifi, IGMPMSG_NOCACHE); if (err < 0) { /* If the report failed throw the cache entry @@ -1125,7 +1103,6 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb) } /* See if we can append the packet */ - if (c->mfc_un.unres.unresolved.qlen > 3) { kfree_skb(skb); err = -ENOBUFS; @@ -1138,9 +1115,7 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb) return err; } -/* - * MFC cache manipulation by user space mroute daemon - */ +/* MFC cache manipulation by user space mroute daemon */ static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent) { @@ -1211,9 +1186,8 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, list_add_rcu(&c->list, &mrt->mfc_cache_array[line]); - /* - * Check to see if we resolved a queued list. If so we - * need to send on the frames and tidy up. + /* Check to see if we resolved a queued list. If so we + * need to send on the frames and tidy up. 
*/ found = false; spin_lock_bh(&mfc_unres_lock); @@ -1238,10 +1212,7 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, return 0; } -/* - * Close the multicast socket, and clear the vif tables etc - */ - +/* Close the multicast socket, and clear the vif tables etc */ static void mroute_clean_tables(struct mr_table *mrt) { int i; @@ -1249,7 +1220,6 @@ static void mroute_clean_tables(struct mr_table *mrt) struct mfc_cache *c, *next; /* Shut down all active vif entries */ - for (i = 0; i < mrt->maxvif; i++) { if (!(mrt->vif_table[i].flags & VIFF_STATIC)) vif_delete(mrt, i, 0, &list); @@ -1257,7 +1227,6 @@ static void mroute_clean_tables(struct mr_table *mrt) unregister_netdevice_many(&list); /* Wipe the cache */ - for (i = 0; i < MFC_LINES; i++) { list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[i], list) { if (c->mfc_flags & MFC_STATIC) @@ -1301,11 +1270,10 @@ static void mrtsock_destruct(struct sock *sk) rtnl_unlock(); } -/* - * Socket options and virtual interface manipulation. The whole - * virtual interface system is a complete heap, but unfortunately - * that's how BSD mrouted happens to think. Maybe one day with a proper - * MOSPF/PIM router set up we can clean this up. +/* Socket options and virtual interface manipulation. The whole + * virtual interface system is a complete heap, but unfortunately + * that's how BSD mrouted happens to think. Maybe one day with a proper + * MOSPF/PIM router set up we can clean this up. */ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsigned int optlen) @@ -1373,10 +1341,9 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi rtnl_unlock(); return ret; - /* - * Manipulate the forwarding caches. These live - * in a sort of kernel/user symbiosis. - */ + /* Manipulate the forwarding caches. These live + * in a sort of kernel/user symbiosis. + */ case MRT_ADD_MFC: case MRT_DEL_MFC: parent = -1; @@ -1397,9 +1364,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi parent); rtnl_unlock(); return ret; - /* - * Control PIM assert. - */ + /* Control PIM assert. */ case MRT_ASSERT: { int v; @@ -1456,19 +1421,13 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi return ret; } #endif - /* - * Spurious command, or MRT_VERSION which you cannot - * set. - */ + /* Spurious command, or MRT_VERSION which you cannot set. */ default: return -ENOPROTOOPT; } } -/* - * Getsock opt support for the multicast routing system. - */ - +/* Getsock opt support for the multicast routing system. */ int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int __user *optlen) { int olr; @@ -1512,10 +1471,7 @@ int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int return 0; } -/* - * The IP multicast ioctl support routines. - */ - +/* The IP multicast ioctl support routines. */ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg) { struct sioc_sg_req sr; @@ -1648,7 +1604,6 @@ int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) } #endif - static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); @@ -1670,17 +1625,14 @@ static int ipmr_device_event(struct notifier_block *this, unsigned long event, v return NOTIFY_DONE; } - static struct notifier_block ip_mr_notifier = { .notifier_call = ipmr_device_event, }; -/* - * Encapsulate a packet by attaching a valid IPIP header to it. 
- * This avoids tunnel drivers and other mess and gives us the speed so - * important for multicast video. +/* Encapsulate a packet by attaching a valid IPIP header to it. + * This avoids tunnel drivers and other mess and gives us the speed so + * important for multicast video. */ - static void ip_encap(struct net *net, struct sk_buff *skb, __be32 saddr, __be32 daddr) { @@ -1722,9 +1674,7 @@ static inline int ipmr_forward_finish(struct net *net, struct sock *sk, return dst_output(net, sk, skb); } -/* - * Processing handlers for ipmr_forward - */ +/* Processing handlers for ipmr_forward */ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, struct sk_buff *skb, struct mfc_cache *c, int vifi) @@ -1773,7 +1723,6 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, * allow to send ICMP, so that packets will disappear * to blackhole. */ - IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS); ip_rt_put(rt); goto out_free; @@ -1805,8 +1754,7 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, IPCB(skb)->flags |= IPSKB_FORWARDED; - /* - * RFC1584 teaches, that DVMRP/PIM router must deliver packets locally + /* RFC1584 teaches, that DVMRP/PIM router must deliver packets locally * not only before forwarding, but after forwarding on all output * interfaces. It is clear, if mrouter runs a multicasting * program, it should receive packets not depending to what interface @@ -1837,7 +1785,6 @@ static int ipmr_find_vif(struct mr_table *mrt, struct net_device *dev) } /* "local" means that we should preserve one skb (for local delivery) */ - static void ip_mr_forward(struct net *net, struct mr_table *mrt, struct sk_buff *skb, struct mfc_cache *cache, int local) @@ -1862,9 +1809,7 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt, goto forward; } - /* - * Wrong interface: drop packet and (maybe) send PIM assert. - */ + /* Wrong interface: drop packet and (maybe) send PIM assert. */ if (mrt->vif_table[vif].dev != skb->dev) { if (rt_is_output_route(skb_rtable(skb))) { /* It is our own packet, looped back. 
@@ -1903,9 +1848,7 @@ forward: mrt->vif_table[vif].pkt_in++; mrt->vif_table[vif].bytes_in += skb->len; - /* - * Forward the frame - */ + /* Forward the frame */ if (cache->mfc_origin == htonl(INADDR_ANY) && cache->mfc_mcastgrp == htonl(INADDR_ANY)) { if (true_vifi >= 0 && @@ -1979,11 +1922,9 @@ static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb) return mrt; } -/* - * Multicast packets for forwarding arrive here - * Called with rcu_read_lock(); +/* Multicast packets for forwarding arrive here + * Called with rcu_read_lock(); */ - int ip_mr_input(struct sk_buff *skb) { struct mfc_cache *cache; @@ -2034,9 +1975,7 @@ int ip_mr_input(struct sk_buff *skb) vif); } - /* - * No usable cache entry - */ + /* No usable cache entry */ if (!cache) { int vif; @@ -2078,10 +2017,7 @@ dont_forward: } #ifdef CONFIG_IP_PIMSM_V1 -/* - * Handle IGMP messages of PIMv1 - */ - +/* Handle IGMP messages of PIMv1 */ int pim_rcv_v1(struct sk_buff *skb) { struct igmphdr *pim; @@ -2406,9 +2342,8 @@ done: } #ifdef CONFIG_PROC_FS -/* - * The /proc interfaces to multicast routing : - * /proc/net/ip_mr_cache & /proc/net/ip_mr_vif +/* The /proc interfaces to multicast routing : + * /proc/net/ip_mr_cache & /proc/net/ip_mr_vif */ struct ipmr_vif_iter { struct seq_net_private p; @@ -2692,10 +2627,7 @@ static const struct net_protocol pim_protocol = { }; #endif - -/* - * Setup for IP multicast routing - */ +/* Setup for IP multicast routing */ static int __net_init ipmr_net_init(struct net *net) { int err; -- cgit v1.2.3 From 7ec3f7b47b8d9ad7ba425726f2c58f9ddce040df Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 24 Nov 2015 10:00:22 +0000 Subject: netfilter: nft_payload: add packet mangling support Add support for mangling packet payload. Checksum for the specified base header is updated automatically if requested, however no updates for any kind of pseudo headers are supported, meaning no stateless NAT is supported. For checksum updates different checksumming methods can be specified. The currently supported methods are NONE for no checksum updates, and INET for internet type checksums. 
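For the INET method this is the usual incremental update: the one's-complement sum of the bytes being overwritten is folded out of the stored checksum and the sum of the new bytes folded in, rather than recomputing the checksum over the whole packet (see nft_payload_set_eval() below, which does this with csum_sub()/csum_add()/csum_fold() and maps a result of 0 to CSUM_MANGLED_0). A freestanding sketch of the same arithmetic on 16-bit words, with illustrative helper names rather than the kernel's:

	#include <stdint.h>
	#include <stddef.h>

	/* One's-complement sum of n 16-bit words, carries folded back in. */
	static uint32_t csum16_add(uint32_t sum, const uint16_t *p, size_t n)
	{
		while (n--)
			sum += *p++;
		while (sum >> 16)
			sum = (sum & 0xffff) + (sum >> 16);
		return sum;
	}

	/* Incremental internet-checksum update when n words change from
	 * old to new (cf. RFC 1624): fold out the old data, fold in the new.
	 */
	uint16_t csum16_update(uint16_t csum, const uint16_t *old,
			       const uint16_t *new, size_t n)
	{
		uint32_t sum = (uint16_t)~csum;           /* unfold stored checksum */

		sum += ~csum16_add(0, old, n) & 0xffff;   /* subtract old data */
		sum = csum16_add(sum, new, n);            /* add new data, fold */
		return (uint16_t)~sum;                    /* invert back to checksum */
	}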
Signed-off-by: Patrick McHardy Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables_core.h | 9 +++ include/uapi/linux/netfilter/nf_tables.h | 17 ++++ net/netfilter/nft_payload.c | 135 +++++++++++++++++++++++++++++-- 3 files changed, 155 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h index c6f400cfaac8..4ff5424909aa 100644 --- a/include/net/netfilter/nf_tables_core.h +++ b/include/net/netfilter/nf_tables_core.h @@ -47,6 +47,15 @@ struct nft_payload { enum nft_registers dreg:8; }; +struct nft_payload_set { + enum nft_payload_bases base:8; + u8 offset; + u8 len; + enum nft_registers sreg:8; + u8 csum_type; + u8 csum_offset; +}; + extern const struct nft_expr_ops nft_payload_fast_ops; int nft_payload_module_init(void); diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index d8c8a7c9d88a..5f3ececf84b3 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -597,6 +597,17 @@ enum nft_payload_bases { NFT_PAYLOAD_TRANSPORT_HEADER, }; +/** + * enum nft_payload_csum_types - nf_tables payload expression checksum types + * + * @NFT_PAYLOAD_CSUM_NONE: no checksumming + * @NFT_PAYLOAD_CSUM_INET: internet checksum (RFC 791) + */ +enum nft_payload_csum_types { + NFT_PAYLOAD_CSUM_NONE, + NFT_PAYLOAD_CSUM_INET, +}; + /** * enum nft_payload_attributes - nf_tables payload expression netlink attributes * @@ -604,6 +615,9 @@ enum nft_payload_bases { * @NFTA_PAYLOAD_BASE: payload base (NLA_U32: nft_payload_bases) * @NFTA_PAYLOAD_OFFSET: payload offset relative to base (NLA_U32) * @NFTA_PAYLOAD_LEN: payload length (NLA_U32) + * @NFTA_PAYLOAD_SREG: source register to load data from (NLA_U32: nft_registers) + * @NFTA_PAYLOAD_CSUM_TYPE: checksum type (NLA_U32) + * @NFTA_PAYLOAD_CSUM_OFFSET: checksum offset relative to base (NLA_U32) */ enum nft_payload_attributes { NFTA_PAYLOAD_UNSPEC, @@ -611,6 +625,9 @@ enum nft_payload_attributes { NFTA_PAYLOAD_BASE, NFTA_PAYLOAD_OFFSET, NFTA_PAYLOAD_LEN, + NFTA_PAYLOAD_SREG, + NFTA_PAYLOAD_CSUM_TYPE, + NFTA_PAYLOAD_CSUM_OFFSET, __NFTA_PAYLOAD_MAX }; #define NFTA_PAYLOAD_MAX (__NFTA_PAYLOAD_MAX - 1) diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 09b4b07eb676..12cd4bf16d17 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -107,10 +107,13 @@ err: } static const struct nla_policy nft_payload_policy[NFTA_PAYLOAD_MAX + 1] = { - [NFTA_PAYLOAD_DREG] = { .type = NLA_U32 }, - [NFTA_PAYLOAD_BASE] = { .type = NLA_U32 }, - [NFTA_PAYLOAD_OFFSET] = { .type = NLA_U32 }, - [NFTA_PAYLOAD_LEN] = { .type = NLA_U32 }, + [NFTA_PAYLOAD_SREG] = { .type = NLA_U32 }, + [NFTA_PAYLOAD_DREG] = { .type = NLA_U32 }, + [NFTA_PAYLOAD_BASE] = { .type = NLA_U32 }, + [NFTA_PAYLOAD_OFFSET] = { .type = NLA_U32 }, + [NFTA_PAYLOAD_LEN] = { .type = NLA_U32 }, + [NFTA_PAYLOAD_CSUM_TYPE] = { .type = NLA_U32 }, + [NFTA_PAYLOAD_CSUM_OFFSET] = { .type = NLA_U32 }, }; static int nft_payload_init(const struct nft_ctx *ctx, @@ -160,6 +163,118 @@ const struct nft_expr_ops nft_payload_fast_ops = { .dump = nft_payload_dump, }; +static void nft_payload_set_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + const struct nft_payload_set *priv = nft_expr_priv(expr); + struct sk_buff *skb = pkt->skb; + const u32 *src = ®s->data[priv->sreg]; + int offset, csum_offset; + __wsum fsum, tsum; + __sum16 sum; + + switch 
(priv->base) { + case NFT_PAYLOAD_LL_HEADER: + if (!skb_mac_header_was_set(skb)) + goto err; + offset = skb_mac_header(skb) - skb->data; + break; + case NFT_PAYLOAD_NETWORK_HEADER: + offset = skb_network_offset(skb); + break; + case NFT_PAYLOAD_TRANSPORT_HEADER: + offset = pkt->xt.thoff; + break; + default: + BUG(); + } + + csum_offset = offset + priv->csum_offset; + offset += priv->offset; + + if (priv->csum_type == NFT_PAYLOAD_CSUM_INET && + (priv->base != NFT_PAYLOAD_TRANSPORT_HEADER || + skb->ip_summed != CHECKSUM_PARTIAL)) { + if (skb_copy_bits(skb, csum_offset, &sum, sizeof(sum)) < 0) + goto err; + + fsum = skb_checksum(skb, offset, priv->len, 0); + tsum = csum_partial(src, priv->len, 0); + sum = csum_fold(csum_add(csum_sub(~csum_unfold(sum), fsum), + tsum)); + if (sum == 0) + sum = CSUM_MANGLED_0; + + if (!skb_make_writable(skb, csum_offset + sizeof(sum)) || + skb_store_bits(skb, csum_offset, &sum, sizeof(sum)) < 0) + goto err; + } + + if (!skb_make_writable(skb, max(offset + priv->len, 0)) || + skb_store_bits(skb, offset, src, priv->len) < 0) + goto err; + + return; +err: + regs->verdict.code = NFT_BREAK; +} + +static int nft_payload_set_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_payload_set *priv = nft_expr_priv(expr); + + priv->base = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_BASE])); + priv->offset = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_OFFSET])); + priv->len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN])); + priv->sreg = nft_parse_register(tb[NFTA_PAYLOAD_SREG]); + + if (tb[NFTA_PAYLOAD_CSUM_TYPE]) + priv->csum_type = + ntohl(nla_get_be32(tb[NFTA_PAYLOAD_CSUM_TYPE])); + if (tb[NFTA_PAYLOAD_CSUM_OFFSET]) + priv->csum_offset = + ntohl(nla_get_be32(tb[NFTA_PAYLOAD_CSUM_OFFSET])); + + switch (priv->csum_type) { + case NFT_PAYLOAD_CSUM_NONE: + case NFT_PAYLOAD_CSUM_INET: + break; + default: + return -EOPNOTSUPP; + } + + return nft_validate_register_load(priv->sreg, priv->len); +} + +static int nft_payload_set_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_payload_set *priv = nft_expr_priv(expr); + + if (nft_dump_register(skb, NFTA_PAYLOAD_SREG, priv->sreg) || + nla_put_be32(skb, NFTA_PAYLOAD_BASE, htonl(priv->base)) || + nla_put_be32(skb, NFTA_PAYLOAD_OFFSET, htonl(priv->offset)) || + nla_put_be32(skb, NFTA_PAYLOAD_LEN, htonl(priv->len)) || + nla_put_be32(skb, NFTA_PAYLOAD_CSUM_TYPE, htonl(priv->csum_type)) || + nla_put_be32(skb, NFTA_PAYLOAD_CSUM_OFFSET, + htonl(priv->csum_offset))) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -1; +} + +static const struct nft_expr_ops nft_payload_set_ops = { + .type = &nft_payload_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_payload_set)), + .eval = nft_payload_set_eval, + .init = nft_payload_set_init, + .dump = nft_payload_set_dump, +}; + static const struct nft_expr_ops * nft_payload_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) @@ -167,8 +282,7 @@ nft_payload_select_ops(const struct nft_ctx *ctx, enum nft_payload_bases base; unsigned int offset, len; - if (tb[NFTA_PAYLOAD_DREG] == NULL || - tb[NFTA_PAYLOAD_BASE] == NULL || + if (tb[NFTA_PAYLOAD_BASE] == NULL || tb[NFTA_PAYLOAD_OFFSET] == NULL || tb[NFTA_PAYLOAD_LEN] == NULL) return ERR_PTR(-EINVAL); @@ -183,6 +297,15 @@ nft_payload_select_ops(const struct nft_ctx *ctx, return ERR_PTR(-EOPNOTSUPP); } + if (tb[NFTA_PAYLOAD_SREG] != NULL) { + if (tb[NFTA_PAYLOAD_DREG] != NULL) + return ERR_PTR(-EINVAL); + return &nft_payload_set_ops; + } + + if 
(tb[NFTA_PAYLOAD_DREG] == NULL) + return ERR_PTR(-EINVAL); + offset = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_OFFSET])); len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN])); -- cgit v1.2.3 From 80a19e338d458abb5a700df3fd00795c51361f06 Mon Sep 17 00:00:00 2001 From: Asias He Date: Wed, 2 Dec 2015 14:44:00 +0800 Subject: VSOCK: Introduce virtio-vsock-common.ko This module contains the common code and header files for the following virtio-vsock and virtio-vhost kernel modules. Signed-off-by: Asias He Signed-off-by: Stefan Hajnoczi Signed-off-by: David S. Miller --- include/linux/virtio_vsock.h | 209 +++++ include/uapi/linux/virtio_ids.h | 1 + include/uapi/linux/virtio_vsock.h | 89 +++ net/vmw_vsock/virtio_transport_common.c | 1272 +++++++++++++++++++++++++++++++ 4 files changed, 1571 insertions(+) create mode 100644 include/linux/virtio_vsock.h create mode 100644 include/uapi/linux/virtio_vsock.h create mode 100644 net/vmw_vsock/virtio_transport_common.c (limited to 'include/uapi/linux') diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h new file mode 100644 index 000000000000..a5f3ecc038f7 --- /dev/null +++ b/include/linux/virtio_vsock.h @@ -0,0 +1,209 @@ +/* + * This header, excluding the #ifdef __KERNEL__ part, is BSD licensed so + * anyone can use the definitions to implement compatible drivers/servers: + * + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of IBM nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * Copyright (C) Red Hat, Inc., 2013-2015 + * Copyright (C) Asias He , 2013 + * Copyright (C) Stefan Hajnoczi , 2015 + */ + +#ifndef _LINUX_VIRTIO_VSOCK_H +#define _LINUX_VIRTIO_VSOCK_H + +#include +#include +#include + +#define VIRTIO_VSOCK_DEFAULT_MIN_BUF_SIZE 128 +#define VIRTIO_VSOCK_DEFAULT_BUF_SIZE (1024 * 256) +#define VIRTIO_VSOCK_DEFAULT_MAX_BUF_SIZE (1024 * 256) +#define VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE (1024 * 4) +#define VIRTIO_VSOCK_MAX_BUF_SIZE 0xFFFFFFFFUL +#define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE (1024 * 64) +#define VIRTIO_VSOCK_MAX_TX_BUF_SIZE (1024 * 1024 * 16) +#define VIRTIO_VSOCK_MAX_DGRAM_SIZE (1024 * 64) + +struct vsock_transport_recv_notify_data; +struct vsock_transport_send_notify_data; +struct sockaddr_vm; +struct vsock_sock; + +enum { + VSOCK_VQ_CTRL = 0, + VSOCK_VQ_RX = 1, /* for host to guest data */ + VSOCK_VQ_TX = 2, /* for guest to host data */ + VSOCK_VQ_MAX = 3, +}; + +/* virtio transport socket state */ +struct virtio_transport { + struct virtio_transport_pkt_ops *ops; + struct vsock_sock *vsk; + + u32 buf_size; + u32 buf_size_min; + u32 buf_size_max; + + struct mutex tx_lock; + struct mutex rx_lock; + + struct list_head rx_queue; + u32 rx_bytes; + + /* Protected by trans->tx_lock */ + u32 tx_cnt; + u32 buf_alloc; + u32 peer_fwd_cnt; + u32 peer_buf_alloc; + /* Protected by trans->rx_lock */ + u32 fwd_cnt; + + /* Protected by sk_lock */ + u16 dgram_id; + struct list_head incomplete_dgrams; /* dgram fragments */ +}; + +struct virtio_vsock_pkt { + struct virtio_vsock_hdr hdr; + struct virtio_transport *trans; + struct work_struct work; + struct list_head list; + void *buf; + u32 len; + u32 off; +}; + +struct virtio_vsock_pkt_info { + u32 remote_cid, remote_port; + struct msghdr *msg; + u32 pkt_len; + u16 type; + u16 op; + u32 flags; + u16 dgram_id; + u16 dgram_len; +}; + +struct virtio_transport_pkt_ops { + int (*send_pkt)(struct vsock_sock *vsk, + struct virtio_vsock_pkt_info *info); +}; + +void virtio_vsock_dumppkt(const char *func, + const struct virtio_vsock_pkt *pkt); + +struct sock * +virtio_transport_get_pending(struct sock *listener, + struct virtio_vsock_pkt *pkt); +struct virtio_vsock_pkt * +virtio_transport_alloc_pkt(struct vsock_sock *vsk, + struct virtio_vsock_pkt_info *info, + size_t len, + u32 src_cid, + u32 src_port, + u32 dst_cid, + u32 dst_port); +ssize_t +virtio_transport_stream_dequeue(struct vsock_sock *vsk, + struct msghdr *msg, + size_t len, + int type); +int +virtio_transport_dgram_dequeue(struct vsock_sock *vsk, + struct msghdr *msg, + size_t len, int flags); + +s64 virtio_transport_stream_has_data(struct vsock_sock *vsk); +s64 virtio_transport_stream_has_space(struct vsock_sock *vsk); + +int virtio_transport_do_socket_init(struct vsock_sock *vsk, + struct vsock_sock *psk); +u64 virtio_transport_get_buffer_size(struct vsock_sock *vsk); +u64 virtio_transport_get_min_buffer_size(struct vsock_sock *vsk); +u64 virtio_transport_get_max_buffer_size(struct vsock_sock *vsk); +void virtio_transport_set_buffer_size(struct vsock_sock *vsk, u64 val); +void virtio_transport_set_min_buffer_size(struct vsock_sock *vsk, u64 val); +void virtio_transport_set_max_buffer_size(struct vsock_sock *vs, u64 val); +int +virtio_transport_notify_poll_in(struct vsock_sock *vsk, + size_t target, + bool *data_ready_now); +int +virtio_transport_notify_poll_out(struct vsock_sock *vsk, + size_t target, + bool *space_available_now); + +int virtio_transport_notify_recv_init(struct vsock_sock *vsk, + size_t target, struct vsock_transport_recv_notify_data *data); +int 
virtio_transport_notify_recv_pre_block(struct vsock_sock *vsk, + size_t target, struct vsock_transport_recv_notify_data *data); +int virtio_transport_notify_recv_pre_dequeue(struct vsock_sock *vsk, + size_t target, struct vsock_transport_recv_notify_data *data); +int virtio_transport_notify_recv_post_dequeue(struct vsock_sock *vsk, + size_t target, ssize_t copied, bool data_read, + struct vsock_transport_recv_notify_data *data); +int virtio_transport_notify_send_init(struct vsock_sock *vsk, + struct vsock_transport_send_notify_data *data); +int virtio_transport_notify_send_pre_block(struct vsock_sock *vsk, + struct vsock_transport_send_notify_data *data); +int virtio_transport_notify_send_pre_enqueue(struct vsock_sock *vsk, + struct vsock_transport_send_notify_data *data); +int virtio_transport_notify_send_post_enqueue(struct vsock_sock *vsk, + ssize_t written, struct vsock_transport_send_notify_data *data); + +u64 virtio_transport_stream_rcvhiwat(struct vsock_sock *vsk); +bool virtio_transport_stream_is_active(struct vsock_sock *vsk); +bool virtio_transport_stream_allow(u32 cid, u32 port); +int virtio_transport_dgram_bind(struct vsock_sock *vsk, + struct sockaddr_vm *addr); +bool virtio_transport_dgram_allow(u32 cid, u32 port); + +int virtio_transport_connect(struct vsock_sock *vsk); + +int virtio_transport_shutdown(struct vsock_sock *vsk, int mode); + +void virtio_transport_release(struct vsock_sock *vsk); + +ssize_t +virtio_transport_stream_enqueue(struct vsock_sock *vsk, + struct msghdr *msg, + size_t len); +int +virtio_transport_dgram_enqueue(struct vsock_sock *vsk, + struct sockaddr_vm *remote_addr, + struct msghdr *msg, + size_t len); + +void virtio_transport_destruct(struct vsock_sock *vsk); + +void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt); +void virtio_transport_free_pkt(struct virtio_vsock_pkt *pkt); +void virtio_transport_inc_tx_pkt(struct virtio_vsock_pkt *pkt); +void virtio_transport_dec_tx_pkt(struct virtio_vsock_pkt *pkt); +u32 virtio_transport_get_credit(struct virtio_transport *trans, u32 wanted); +void virtio_transport_put_credit(struct virtio_transport *trans, u32 credit); +#endif /* _LINUX_VIRTIO_VSOCK_H */ diff --git a/include/uapi/linux/virtio_ids.h b/include/uapi/linux/virtio_ids.h index 77925f587b15..16dcf5d06cd7 100644 --- a/include/uapi/linux/virtio_ids.h +++ b/include/uapi/linux/virtio_ids.h @@ -39,6 +39,7 @@ #define VIRTIO_ID_9P 9 /* 9p virtio console */ #define VIRTIO_ID_RPROC_SERIAL 11 /* virtio remoteproc serial link */ #define VIRTIO_ID_CAIF 12 /* Virtio caif */ +#define VIRTIO_ID_VSOCK 13 /* virtio vsock transport */ #define VIRTIO_ID_GPU 16 /* virtio GPU */ #define VIRTIO_ID_INPUT 18 /* virtio input */ diff --git a/include/uapi/linux/virtio_vsock.h b/include/uapi/linux/virtio_vsock.h new file mode 100644 index 000000000000..8cf9b5682628 --- /dev/null +++ b/include/uapi/linux/virtio_vsock.h @@ -0,0 +1,89 @@ +/* + * This header, excluding the #ifdef __KERNEL__ part, is BSD licensed so + * anyone can use the definitions to implement compatible drivers/servers: + * + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of IBM nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Copyright (C) Red Hat, Inc., 2013-2015 + * Copyright (C) Asias He , 2013 + * Copyright (C) Stefan Hajnoczi , 2015 + */ + +#ifndef _UAPI_LINUX_VIRTIO_VSOCK_H +#define _UAPI_LINUX_VIRTIO_VOSCK_H + +#include +#include +#include + +struct virtio_vsock_config { + __le32 guest_cid; + __le32 max_virtqueue_pairs; +}; + +struct virtio_vsock_hdr { + __le32 src_cid; + __le32 src_port; + __le32 dst_cid; + __le32 dst_port; + __le32 len; + __le16 type; /* enum virtio_vsock_type */ + __le16 op; /* enum virtio_vsock_op */ + __le32 flags; + __le32 buf_alloc; + __le32 fwd_cnt; +}; + +enum virtio_vsock_type { + VIRTIO_VSOCK_TYPE_STREAM = 1, + VIRTIO_VSOCK_TYPE_DGRAM = 2, +}; + +enum virtio_vsock_op { + VIRTIO_VSOCK_OP_INVALID = 0, + + /* Connect operations */ + VIRTIO_VSOCK_OP_REQUEST = 1, + VIRTIO_VSOCK_OP_RESPONSE = 2, + VIRTIO_VSOCK_OP_ACK = 3, + VIRTIO_VSOCK_OP_RST = 4, + VIRTIO_VSOCK_OP_SHUTDOWN = 5, + + /* To send payload */ + VIRTIO_VSOCK_OP_RW = 6, + + /* Tell the peer our credit info */ + VIRTIO_VSOCK_OP_CREDIT_UPDATE = 7, + /* Request the peer to send the credit info to us */ + VIRTIO_VSOCK_OP_CREDIT_REQUEST = 8, +}; + +/* VIRTIO_VSOCK_OP_SHUTDOWN flags values */ +enum virtio_vsock_shutdown { + VIRTIO_VSOCK_SHUTDOWN_RCV = 1, + VIRTIO_VSOCK_SHUTDOWN_SEND = 2, +}; + +#endif /* _UAPI_LINUX_VIRTIO_VSOCK_H */ diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c new file mode 100644 index 000000000000..28f790da6f15 --- /dev/null +++ b/net/vmw_vsock/virtio_transport_common.c @@ -0,0 +1,1272 @@ +/* + * common code for virtio vsock + * + * Copyright (C) 2013-2015 Red Hat, Inc. + * Author: Asias He + * Stefan Hajnoczi + * + * This work is licensed under the terms of the GNU GPL, version 2. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define COOKIEBITS 24 +#define COOKIEMASK (((u32)1 << COOKIEBITS) - 1) +#define VSOCK_TIMEOUT_INIT 4 + +#define SHA_MESSAGE_WORDS 16 +#define SHA_VSOCK_WORDS 5 + +static u32 vsockcookie_secret[2][SHA_MESSAGE_WORDS - SHA_VSOCK_WORDS + + SHA_DIGEST_WORDS]; + +static DEFINE_PER_CPU(__u32[SHA_MESSAGE_WORDS + SHA_DIGEST_WORDS + + SHA_WORKSPACE_WORDS], vsock_cookie_scratch); + +static u32 cookie_hash(u32 saddr, u32 daddr, u16 sport, u16 dport, + u32 count, int c) +{ + __u32 *tmp = this_cpu_ptr(vsock_cookie_scratch); + + memcpy(tmp + SHA_VSOCK_WORDS, vsockcookie_secret[c], + sizeof(vsockcookie_secret[c])); + tmp[0] = saddr; + tmp[1] = daddr; + tmp[2] = sport; + tmp[3] = dport; + tmp[4] = count; + sha_transform(tmp + SHA_MESSAGE_WORDS, (__u8 *)tmp, + tmp + SHA_MESSAGE_WORDS + SHA_DIGEST_WORDS); + + return tmp[17]; +} + +static u32 +virtio_vsock_secure_cookie(u32 saddr, u32 daddr, u32 sport, u32 dport, + u32 count) +{ + u32 h1, h2; + + h1 = cookie_hash(saddr, daddr, sport, dport, 0, 0); + h2 = cookie_hash(saddr, daddr, sport, dport, count, 1); + + return h1 + (count << COOKIEBITS) + (h2 & COOKIEMASK); +} + +static u32 +virtio_vsock_check_cookie(u32 saddr, u32 daddr, u32 sport, u32 dport, + u32 count, u32 cookie, u32 maxdiff) +{ + u32 diff; + u32 ret; + + cookie -= cookie_hash(saddr, daddr, sport, dport, 0, 0); + + diff = (count - (cookie >> COOKIEBITS)) & ((u32)-1 >> COOKIEBITS); + pr_debug("%s: diff=%x\n", __func__, diff); + if (diff >= maxdiff) + return (u32)-1; + + ret = (cookie - + cookie_hash(saddr, daddr, sport, dport, count - diff, 1)) + & COOKIEMASK; + pr_debug("%s: ret=%x\n", __func__, diff); + + return ret; +} + +void virtio_vsock_dumppkt(const char *func, const struct virtio_vsock_pkt *pkt) +{ + pr_debug("%s: pkt=%p, op=%d, len=%d, %d:%d---%d:%d, len=%d\n", + func, pkt, + le16_to_cpu(pkt->hdr.op), + le32_to_cpu(pkt->hdr.len), + le32_to_cpu(pkt->hdr.src_cid), + le32_to_cpu(pkt->hdr.src_port), + le32_to_cpu(pkt->hdr.dst_cid), + le32_to_cpu(pkt->hdr.dst_port), + pkt->len); +} +EXPORT_SYMBOL_GPL(virtio_vsock_dumppkt); + +struct virtio_vsock_pkt * +virtio_transport_alloc_pkt(struct vsock_sock *vsk, + struct virtio_vsock_pkt_info *info, + size_t len, + u32 src_cid, + u32 src_port, + u32 dst_cid, + u32 dst_port) +{ + struct virtio_transport *trans = vsk->trans; + struct virtio_vsock_pkt *pkt; + int err; + + BUG_ON(!trans); + + pkt = kzalloc(sizeof(*pkt), GFP_KERNEL); + if (!pkt) + return NULL; + + pkt->hdr.type = cpu_to_le16(info->type); + pkt->hdr.op = cpu_to_le16(info->op); + pkt->hdr.src_cid = cpu_to_le32(src_cid); + pkt->hdr.src_port = cpu_to_le32(src_port); + pkt->hdr.dst_cid = cpu_to_le32(dst_cid); + pkt->hdr.dst_port = cpu_to_le32(dst_port); + pkt->hdr.flags = cpu_to_le32(info->flags); + pkt->len = len; + pkt->trans = trans; + if (info->type == VIRTIO_VSOCK_TYPE_DGRAM) + pkt->hdr.len = cpu_to_le32(len + (info->dgram_len << 16)); + else if (info->type == VIRTIO_VSOCK_TYPE_STREAM) + pkt->hdr.len = cpu_to_le32(len); + + if (info->msg && len > 0) { + pkt->buf = kmalloc(len, GFP_KERNEL); + if (!pkt->buf) + goto out_pkt; + err = memcpy_from_msg(pkt->buf, info->msg, len); + if (err) + goto out; + } + + return pkt; + +out: + kfree(pkt->buf); +out_pkt: + kfree(pkt); + return NULL; +} +EXPORT_SYMBOL_GPL(virtio_transport_alloc_pkt); + +struct sock * +virtio_transport_get_pending(struct sock *listener, + struct virtio_vsock_pkt *pkt) +{ + struct vsock_sock *vlistener; + struct 
vsock_sock *vpending; + struct sockaddr_vm src; + struct sockaddr_vm dst; + struct sock *pending; + + vsock_addr_init(&src, le32_to_cpu(pkt->hdr.src_cid), le32_to_cpu(pkt->hdr.src_port)); + vsock_addr_init(&dst, le32_to_cpu(pkt->hdr.dst_cid), le32_to_cpu(pkt->hdr.dst_port)); + + vlistener = vsock_sk(listener); + list_for_each_entry(vpending, &vlistener->pending_links, + pending_links) { + if (vsock_addr_equals_addr(&src, &vpending->remote_addr) && + vsock_addr_equals_addr(&dst, &vpending->local_addr)) { + pending = sk_vsock(vpending); + sock_hold(pending); + return pending; + } + } + + return NULL; +} +EXPORT_SYMBOL_GPL(virtio_transport_get_pending); + +static void virtio_transport_inc_rx_pkt(struct virtio_vsock_pkt *pkt) +{ + pkt->trans->rx_bytes += pkt->len; +} + +static void virtio_transport_dec_rx_pkt(struct virtio_vsock_pkt *pkt) +{ + pkt->trans->rx_bytes -= pkt->len; + pkt->trans->fwd_cnt += pkt->len; +} + +void virtio_transport_inc_tx_pkt(struct virtio_vsock_pkt *pkt) +{ + mutex_lock(&pkt->trans->tx_lock); + pkt->hdr.fwd_cnt = cpu_to_le32(pkt->trans->fwd_cnt); + pkt->hdr.buf_alloc = cpu_to_le32(pkt->trans->buf_alloc); + mutex_unlock(&pkt->trans->tx_lock); +} +EXPORT_SYMBOL_GPL(virtio_transport_inc_tx_pkt); + +void virtio_transport_dec_tx_pkt(struct virtio_vsock_pkt *pkt) +{ +} +EXPORT_SYMBOL_GPL(virtio_transport_dec_tx_pkt); + +u32 virtio_transport_get_credit(struct virtio_transport *trans, u32 credit) +{ + u32 ret; + + mutex_lock(&trans->tx_lock); + ret = trans->peer_buf_alloc - (trans->tx_cnt - trans->peer_fwd_cnt); + if (ret > credit) + ret = credit; + trans->tx_cnt += ret; + mutex_unlock(&trans->tx_lock); + + pr_debug("%s: ret=%d, buf_alloc=%d, peer_buf_alloc=%d," + "tx_cnt=%d, fwd_cnt=%d, peer_fwd_cnt=%d\n", __func__, + ret, trans->buf_alloc, trans->peer_buf_alloc, + trans->tx_cnt, trans->fwd_cnt, trans->peer_fwd_cnt); + + return ret; +} +EXPORT_SYMBOL_GPL(virtio_transport_get_credit); + +void virtio_transport_put_credit(struct virtio_transport *trans, u32 credit) +{ + mutex_lock(&trans->tx_lock); + trans->tx_cnt -= credit; + mutex_unlock(&trans->tx_lock); +} +EXPORT_SYMBOL_GPL(virtio_transport_put_credit); + +static int virtio_transport_send_credit_update(struct vsock_sock *vsk, int type, struct virtio_vsock_hdr *hdr) +{ + struct virtio_transport *trans = vsk->trans; + struct virtio_vsock_pkt_info info = { + .op = VIRTIO_VSOCK_OP_CREDIT_UPDATE, + .type = type, + }; + + if (hdr && type == VIRTIO_VSOCK_TYPE_DGRAM) { + info.remote_cid = le32_to_cpu(hdr->src_cid); + info.remote_port = le32_to_cpu(hdr->src_port); + } + + pr_debug("%s: sk=%p send_credit_update\n", __func__, vsk); + return trans->ops->send_pkt(vsk, &info); +} + +static int virtio_transport_send_credit_request(struct vsock_sock *vsk, int type) +{ + struct virtio_transport *trans = vsk->trans; + struct virtio_vsock_pkt_info info = { + .op = VIRTIO_VSOCK_OP_CREDIT_REQUEST, + .type = type, + }; + + pr_debug("%s: sk=%p send_credit_request\n", __func__, vsk); + return trans->ops->send_pkt(vsk, &info); +} + +static ssize_t +virtio_transport_stream_do_dequeue(struct vsock_sock *vsk, + struct msghdr *msg, + size_t len) +{ + struct virtio_transport *trans = vsk->trans; + struct virtio_vsock_pkt *pkt; + size_t bytes, total = 0; + int err = -EFAULT; + + mutex_lock(&trans->rx_lock); + while (total < len && trans->rx_bytes > 0 && + !list_empty(&trans->rx_queue)) { + pkt = list_first_entry(&trans->rx_queue, + struct virtio_vsock_pkt, list); + + bytes = len - total; + if (bytes > pkt->len - pkt->off) + bytes = pkt->len - pkt->off; + 
+ err = memcpy_to_msg(msg, pkt->buf + pkt->off, bytes); + if (err) + goto out; + total += bytes; + pkt->off += bytes; + if (pkt->off == pkt->len) { + virtio_transport_dec_rx_pkt(pkt); + list_del(&pkt->list); + virtio_transport_free_pkt(pkt); + } + } + mutex_unlock(&trans->rx_lock); + + /* Send a credit pkt to peer */ + virtio_transport_send_credit_update(vsk, VIRTIO_VSOCK_TYPE_STREAM, + NULL); + + return total; + +out: + mutex_unlock(&trans->rx_lock); + if (total) + err = total; + return err; +} + +ssize_t +virtio_transport_stream_dequeue(struct vsock_sock *vsk, + struct msghdr *msg, + size_t len, int flags) +{ + if (flags & MSG_PEEK) + return -EOPNOTSUPP; + + return virtio_transport_stream_do_dequeue(vsk, msg, len); +} +EXPORT_SYMBOL_GPL(virtio_transport_stream_dequeue); + +struct dgram_skb { + struct list_head list; + struct sk_buff *skb; + u16 id; +}; + +static struct dgram_skb *dgram_id_to_skb(struct virtio_transport *trans, + u16 id) +{ + struct dgram_skb *dgram_skb; + + list_for_each_entry(dgram_skb, &trans->incomplete_dgrams, list) { + if (dgram_skb->id == id) + return dgram_skb; + } + + return NULL; +} + +static void +virtio_transport_recv_dgram(struct sock *sk, + struct virtio_vsock_pkt *pkt) +{ + struct sk_buff *skb = NULL; + struct vsock_sock *vsk; + struct virtio_transport *trans; + size_t size; + u16 dgram_id, pkt_off, dgram_len, pkt_len; + u32 flags, len; + struct dgram_skb *dgram_skb; + + vsk = vsock_sk(sk); + trans = vsk->trans; + + /* len: dgram_len | pkt_len */ + len = le32_to_cpu(pkt->hdr.len); + dgram_len = len >> 16; + pkt_len = len & 0xFFFF; + + /* flags: dgram_id | pkt_off */ + flags = le32_to_cpu(pkt->hdr.flags); + dgram_id = flags >> 16; + pkt_off = flags & 0xFFFF; + + pr_debug("%s: dgram_len=%d, pkt_len=%d, id=%d, off=%d\n", __func__, + dgram_len, pkt_len, dgram_id, pkt_off); + + dgram_skb = dgram_id_to_skb(trans, dgram_id); + if (dgram_skb) { + /* This pkt is for a existing dgram */ + skb = dgram_skb->skb; + pr_debug("%s:found skb\n", __func__); + } + + /* Packet payload must be within datagram bounds */ + if (pkt_len > VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE) + goto drop; + if (pkt_len > dgram_len) + goto drop; + if (pkt_off > dgram_len) + goto drop; + if (dgram_len - pkt_off < pkt_len) + goto drop; + + if (!skb) { + /* This pkt is for a new dgram */ + pr_debug("%s:create skb\n", __func__); + + size = sizeof(pkt->hdr) + dgram_len; + /* Attach the packet to the socket's receive queue as an sk_buff. */ + dgram_skb = kzalloc(sizeof(struct dgram_skb), GFP_ATOMIC); + if (!dgram_skb) + goto drop; + + skb = alloc_skb(size, GFP_ATOMIC); + if (!skb) { + kfree(dgram_skb); + dgram_skb = NULL; + goto drop; + } + dgram_skb->id = dgram_id; + dgram_skb->skb = skb; + list_add_tail(&dgram_skb->list, &trans->incomplete_dgrams); + + /* sk_receive_skb() will do a sock_put(), so hold here. 
*/ + sock_hold(sk); + skb_put(skb, size); + memcpy(skb->data, &pkt->hdr, sizeof(pkt->hdr)); + } + + memcpy(skb->data + sizeof(pkt->hdr) + pkt_off, pkt->buf, pkt_len); + + pr_debug("%s:C, off=%d, pkt_len=%d, dgram_len=%d\n", __func__, + pkt_off, pkt_len, dgram_len); + + /* We are done with this dgram */ + if (pkt_off + pkt_len == dgram_len) { + pr_debug("%s:dgram_id=%d is done\n", __func__, dgram_id); + list_del(&dgram_skb->list); + kfree(dgram_skb); + sk_receive_skb(sk, skb, 0); + } + virtio_transport_free_pkt(pkt); + return; + +drop: + if (dgram_skb) { + list_del(&dgram_skb->list); + kfree(dgram_skb); + kfree_skb(skb); + sock_put(sk); + } + virtio_transport_free_pkt(pkt); +} + +int +virtio_transport_dgram_dequeue(struct vsock_sock *vsk, + struct msghdr *msg, + size_t len, int flags) +{ + struct virtio_vsock_hdr *hdr; + struct sk_buff *skb; + int noblock; + int err; + int dgram_len; + + noblock = flags & MSG_DONTWAIT; + + if (flags & MSG_OOB || flags & MSG_ERRQUEUE) + return -EOPNOTSUPP; + + /* Retrieve the head sk_buff from the socket's receive queue. */ + err = 0; + skb = skb_recv_datagram(&vsk->sk, flags, noblock, &err); + if (err) + return err; + if (!skb) + return -EAGAIN; + + hdr = (struct virtio_vsock_hdr *)skb->data; + if (!hdr) + goto out; + + dgram_len = le32_to_cpu(hdr->len) >> 16; + /* Place the datagram payload in the user's iovec. */ + err = skb_copy_datagram_msg(skb, sizeof(*hdr), msg, dgram_len); + if (err) + goto out; + + if (msg->msg_name) { + /* Provide the address of the sender. */ + DECLARE_SOCKADDR(struct sockaddr_vm *, vm_addr, msg->msg_name); + vsock_addr_init(vm_addr, le32_to_cpu(hdr->src_cid), le32_to_cpu(hdr->src_port)); + msg->msg_namelen = sizeof(*vm_addr); + } + err = dgram_len; + + /* Send a credit pkt to peer */ + virtio_transport_send_credit_update(vsk, VIRTIO_VSOCK_TYPE_DGRAM, hdr); + + pr_debug("%s:done, recved =%d\n", __func__, dgram_len); +out: + skb_free_datagram(&vsk->sk, skb); + return err; +} +EXPORT_SYMBOL_GPL(virtio_transport_dgram_dequeue); + +s64 virtio_transport_stream_has_data(struct vsock_sock *vsk) +{ + struct virtio_transport *trans = vsk->trans; + s64 bytes; + + mutex_lock(&trans->rx_lock); + bytes = trans->rx_bytes; + mutex_unlock(&trans->rx_lock); + + return bytes; +} +EXPORT_SYMBOL_GPL(virtio_transport_stream_has_data); + +static s64 virtio_transport_has_space(struct vsock_sock *vsk) +{ + struct virtio_transport *trans = vsk->trans; + s64 bytes; + + bytes = trans->peer_buf_alloc - (trans->tx_cnt - trans->peer_fwd_cnt); + if (bytes < 0) + bytes = 0; + + return bytes; +} + +s64 virtio_transport_stream_has_space(struct vsock_sock *vsk) +{ + struct virtio_transport *trans = vsk->trans; + s64 bytes; + + mutex_lock(&trans->tx_lock); + bytes = virtio_transport_has_space(vsk); + mutex_unlock(&trans->tx_lock); + + pr_debug("%s: bytes=%lld\n", __func__, bytes); + + return bytes; +} +EXPORT_SYMBOL_GPL(virtio_transport_stream_has_space); + +int virtio_transport_do_socket_init(struct vsock_sock *vsk, + struct vsock_sock *psk) +{ + struct virtio_transport *trans; + + trans = kzalloc(sizeof(*trans), GFP_KERNEL); + if (!trans) + return -ENOMEM; + + vsk->trans = trans; + trans->vsk = vsk; + if (psk) { + struct virtio_transport *ptrans = psk->trans; + trans->buf_size = ptrans->buf_size; + trans->buf_size_min = ptrans->buf_size_min; + trans->buf_size_max = ptrans->buf_size_max; + trans->peer_buf_alloc = ptrans->peer_buf_alloc; + } else { + trans->buf_size = VIRTIO_VSOCK_DEFAULT_BUF_SIZE; + trans->buf_size_min = VIRTIO_VSOCK_DEFAULT_MIN_BUF_SIZE; + 
trans->buf_size_max = VIRTIO_VSOCK_DEFAULT_MAX_BUF_SIZE; + } + + trans->buf_alloc = trans->buf_size; + + pr_debug("%s: trans->buf_alloc=%d\n", __func__, trans->buf_alloc); + + mutex_init(&trans->rx_lock); + mutex_init(&trans->tx_lock); + INIT_LIST_HEAD(&trans->rx_queue); + INIT_LIST_HEAD(&trans->incomplete_dgrams); + + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_do_socket_init); + +u64 virtio_transport_get_buffer_size(struct vsock_sock *vsk) +{ + struct virtio_transport *trans = vsk->trans; + + return trans->buf_size; +} +EXPORT_SYMBOL_GPL(virtio_transport_get_buffer_size); + +u64 virtio_transport_get_min_buffer_size(struct vsock_sock *vsk) +{ + struct virtio_transport *trans = vsk->trans; + + return trans->buf_size_min; +} +EXPORT_SYMBOL_GPL(virtio_transport_get_min_buffer_size); + +u64 virtio_transport_get_max_buffer_size(struct vsock_sock *vsk) +{ + struct virtio_transport *trans = vsk->trans; + + return trans->buf_size_max; +} +EXPORT_SYMBOL_GPL(virtio_transport_get_max_buffer_size); + +void virtio_transport_set_buffer_size(struct vsock_sock *vsk, u64 val) +{ + struct virtio_transport *trans = vsk->trans; + + if (val > VIRTIO_VSOCK_MAX_BUF_SIZE) + val = VIRTIO_VSOCK_MAX_BUF_SIZE; + if (val < trans->buf_size_min) + trans->buf_size_min = val; + if (val > trans->buf_size_max) + trans->buf_size_max = val; + trans->buf_size = val; + trans->buf_alloc = val; +} +EXPORT_SYMBOL_GPL(virtio_transport_set_buffer_size); + +void virtio_transport_set_min_buffer_size(struct vsock_sock *vsk, u64 val) +{ + struct virtio_transport *trans = vsk->trans; + + if (val > VIRTIO_VSOCK_MAX_BUF_SIZE) + val = VIRTIO_VSOCK_MAX_BUF_SIZE; + if (val > trans->buf_size) + trans->buf_size = val; + trans->buf_size_min = val; +} +EXPORT_SYMBOL_GPL(virtio_transport_set_min_buffer_size); + +void virtio_transport_set_max_buffer_size(struct vsock_sock *vsk, u64 val) +{ + struct virtio_transport *trans = vsk->trans; + + if (val > VIRTIO_VSOCK_MAX_BUF_SIZE) + val = VIRTIO_VSOCK_MAX_BUF_SIZE; + if (val < trans->buf_size) + trans->buf_size = val; + trans->buf_size_max = val; +} +EXPORT_SYMBOL_GPL(virtio_transport_set_max_buffer_size); + +int +virtio_transport_notify_poll_in(struct vsock_sock *vsk, + size_t target, + bool *data_ready_now) +{ + if (vsock_stream_has_data(vsk)) + *data_ready_now = true; + else + *data_ready_now = false; + + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_poll_in); + +int +virtio_transport_notify_poll_out(struct vsock_sock *vsk, + size_t target, + bool *space_avail_now) +{ + s64 free_space; + + free_space = vsock_stream_has_space(vsk); + if (free_space > 0) + *space_avail_now = true; + else if (free_space == 0) + *space_avail_now = false; + + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_poll_out); + +int virtio_transport_notify_recv_init(struct vsock_sock *vsk, + size_t target, struct vsock_transport_recv_notify_data *data) +{ + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_recv_init); + +int virtio_transport_notify_recv_pre_block(struct vsock_sock *vsk, + size_t target, struct vsock_transport_recv_notify_data *data) +{ + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_recv_pre_block); + +int virtio_transport_notify_recv_pre_dequeue(struct vsock_sock *vsk, + size_t target, struct vsock_transport_recv_notify_data *data) +{ + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_recv_pre_dequeue); + +int virtio_transport_notify_recv_post_dequeue(struct vsock_sock *vsk, + size_t target, ssize_t copied, bool data_read, + struct vsock_transport_recv_notify_data 
*data) +{ + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_recv_post_dequeue); + +int virtio_transport_notify_send_init(struct vsock_sock *vsk, + struct vsock_transport_send_notify_data *data) +{ + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_send_init); + +int virtio_transport_notify_send_pre_block(struct vsock_sock *vsk, + struct vsock_transport_send_notify_data *data) +{ + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_send_pre_block); + +int virtio_transport_notify_send_pre_enqueue(struct vsock_sock *vsk, + struct vsock_transport_send_notify_data *data) +{ + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_send_pre_enqueue); + +int virtio_transport_notify_send_post_enqueue(struct vsock_sock *vsk, + ssize_t written, struct vsock_transport_send_notify_data *data) +{ + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_send_post_enqueue); + +u64 virtio_transport_stream_rcvhiwat(struct vsock_sock *vsk) +{ + struct virtio_transport *trans = vsk->trans; + + return trans->buf_size; +} +EXPORT_SYMBOL_GPL(virtio_transport_stream_rcvhiwat); + +bool virtio_transport_stream_is_active(struct vsock_sock *vsk) +{ + return true; +} +EXPORT_SYMBOL_GPL(virtio_transport_stream_is_active); + +bool virtio_transport_stream_allow(u32 cid, u32 port) +{ + return true; +} +EXPORT_SYMBOL_GPL(virtio_transport_stream_allow); + +int virtio_transport_dgram_bind(struct vsock_sock *vsk, + struct sockaddr_vm *addr) +{ + return vsock_bind_dgram_generic(vsk, addr); +} +EXPORT_SYMBOL_GPL(virtio_transport_dgram_bind); + +bool virtio_transport_dgram_allow(u32 cid, u32 port) +{ + return true; +} +EXPORT_SYMBOL_GPL(virtio_transport_dgram_allow); + +int virtio_transport_connect(struct vsock_sock *vsk) +{ + struct virtio_transport *trans = vsk->trans; + struct virtio_vsock_pkt_info info = { + .op = VIRTIO_VSOCK_OP_REQUEST, + .type = VIRTIO_VSOCK_TYPE_STREAM, + }; + + pr_debug("%s: vsk=%p send_request\n", __func__, vsk); + return trans->ops->send_pkt(vsk, &info); +} +EXPORT_SYMBOL_GPL(virtio_transport_connect); + +int virtio_transport_shutdown(struct vsock_sock *vsk, int mode) +{ + struct virtio_transport *trans = vsk->trans; + struct virtio_vsock_pkt_info info = { + .op = VIRTIO_VSOCK_OP_SHUTDOWN, + .type = VIRTIO_VSOCK_TYPE_STREAM, + .flags = (mode & RCV_SHUTDOWN ? + VIRTIO_VSOCK_SHUTDOWN_RCV : 0) | + (mode & SEND_SHUTDOWN ? 
+ VIRTIO_VSOCK_SHUTDOWN_SEND : 0), + }; + + pr_debug("%s: vsk=%p: send_shutdown\n", __func__, vsk); + return trans->ops->send_pkt(vsk, &info); +} +EXPORT_SYMBOL_GPL(virtio_transport_shutdown); + +void virtio_transport_release(struct vsock_sock *vsk) +{ + struct virtio_transport *trans = vsk->trans; + struct sock *sk = &vsk->sk; + struct dgram_skb *dgram_skb; + struct dgram_skb *dgram_skb_tmp; + + pr_debug("%s: vsk=%p\n", __func__, vsk); + + /* Tell other side to terminate connection */ + if (sk->sk_type == SOCK_STREAM && sk->sk_state == SS_CONNECTED) { + virtio_transport_shutdown(vsk, SHUTDOWN_MASK); + } + + /* Free incomplete dgrams */ + lock_sock(sk); + list_for_each_entry_safe(dgram_skb, dgram_skb_tmp, + &trans->incomplete_dgrams, list) { + list_del(&dgram_skb->list); + kfree_skb(dgram_skb->skb); + kfree(dgram_skb); + sock_put(sk); /* held in virtio_transport_recv_dgram() */ + } + release_sock(sk); +} +EXPORT_SYMBOL_GPL(virtio_transport_release); + +int +virtio_transport_dgram_enqueue(struct vsock_sock *vsk, + struct sockaddr_vm *remote_addr, + struct msghdr *msg, + size_t dgram_len) +{ + struct virtio_transport *trans = vsk->trans; + struct virtio_vsock_pkt_info info = { + .op = VIRTIO_VSOCK_OP_RW, + .type = VIRTIO_VSOCK_TYPE_DGRAM, + .msg = msg, + }; + size_t total_written = 0, pkt_off = 0; + ssize_t written; /* send_pkt() may return a negative errno */ + u16 dgram_id; + + /* The max size of a single dgram we support is 64KB */ + if (dgram_len > VIRTIO_VSOCK_MAX_DGRAM_SIZE) + return -EMSGSIZE; + + info.dgram_len = dgram_len; + vsk->remote_addr = *remote_addr; + + dgram_id = trans->dgram_id++; + + /* TODO: To optimize, if we have enough credit to send the pkt already, + * do not ask the peer to send credit to us */ + virtio_transport_send_credit_request(vsk, VIRTIO_VSOCK_TYPE_DGRAM); + + while (total_written < dgram_len) { + info.pkt_len = dgram_len - total_written; + info.flags = dgram_id << 16 | pkt_off; + written = trans->ops->send_pkt(vsk, &info); + if (written < 0) + return -ENOMEM; + if (written == 0) { + /* TODO: if written == 0, we need a proper sleep & wakeup + * here instead of polling with msleep() */ + pr_debug("%s: SHOULD WAIT written==0", __func__); + msleep(10); + } + total_written += written; + pkt_off += written; + pr_debug("%s:id=%d, dgram_len=%zu, off=%zu, total_written=%zu, written=%zd\n", + __func__, dgram_id, dgram_len, pkt_off, total_written, written); + } + + return dgram_len; +} +EXPORT_SYMBOL_GPL(virtio_transport_dgram_enqueue); + +ssize_t +virtio_transport_stream_enqueue(struct vsock_sock *vsk, + struct msghdr *msg, + size_t len) +{ + struct virtio_transport *trans = vsk->trans; + struct virtio_vsock_pkt_info info = { + .op = VIRTIO_VSOCK_OP_RW, + .type = VIRTIO_VSOCK_TYPE_STREAM, + .msg = msg, + .pkt_len = len, + }; + + return trans->ops->send_pkt(vsk, &info); +} +EXPORT_SYMBOL_GPL(virtio_transport_stream_enqueue); + +void virtio_transport_destruct(struct vsock_sock *vsk) +{ + struct virtio_transport *trans = vsk->trans; + + pr_debug("%s: vsk=%p\n", __func__, vsk); + kfree(trans); +} +EXPORT_SYMBOL_GPL(virtio_transport_destruct); + +static int virtio_transport_send_ack(struct vsock_sock *vsk, u32 cookie) +{ + struct virtio_transport *trans = vsk->trans; + struct virtio_vsock_pkt_info info = { + .op = VIRTIO_VSOCK_OP_ACK, + .type = VIRTIO_VSOCK_TYPE_STREAM, + .flags = cpu_to_le32(cookie), + }; + + pr_debug("%s: sk=%p send_ack\n", __func__, vsk); + return trans->ops->send_pkt(vsk, &info); +} + +static int virtio_transport_send_reset(struct vsock_sock *vsk, + struct virtio_vsock_pkt *pkt) +{ + struct virtio_transport *trans = vsk->trans; +
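/* A RST sent in reply to a peer's RST would ping-pong between the two sides forever, so the guard below suppresses the reply when the packet that triggered it is itself a RST. */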
struct virtio_vsock_pkt_info info = { + .op = VIRTIO_VSOCK_OP_RST, + .type = VIRTIO_VSOCK_TYPE_STREAM, + }; + + pr_debug("%s\n", __func__); + + /* Send RST only if the original pkt is not a RST pkt */ + if (le16_to_cpu(pkt->hdr.op) == VIRTIO_VSOCK_OP_RST) + return 0; + + return trans->ops->send_pkt(vsk, &info); +} + +static int +virtio_transport_recv_connecting(struct sock *sk, + struct virtio_vsock_pkt *pkt) +{ + struct vsock_sock *vsk = vsock_sk(sk); + int err; + int skerr; + u32 cookie; + + pr_debug("%s: vsk=%p\n", __func__, vsk); + switch (le16_to_cpu(pkt->hdr.op)) { + case VIRTIO_VSOCK_OP_RESPONSE: + cookie = le32_to_cpu(pkt->hdr.flags); + pr_debug("%s: got RESPONSE and send ACK, cookie=%x\n", __func__, cookie); + err = virtio_transport_send_ack(vsk, cookie); + if (err < 0) { + skerr = -err; + goto destroy; + } + sk->sk_state = SS_CONNECTED; + sk->sk_socket->state = SS_CONNECTED; + vsock_insert_connected(vsk); + sk->sk_state_change(sk); + break; + case VIRTIO_VSOCK_OP_INVALID: + pr_debug("%s: got invalid\n", __func__); + break; + case VIRTIO_VSOCK_OP_RST: + pr_debug("%s: got rst\n", __func__); + skerr = ECONNRESET; + err = 0; + goto destroy; + default: + pr_debug("%s: got def\n", __func__); + skerr = EPROTO; + err = -EINVAL; + goto destroy; + } + return 0; + +destroy: + virtio_transport_send_reset(vsk, pkt); + sk->sk_state = SS_UNCONNECTED; + sk->sk_err = skerr; + sk->sk_error_report(sk); + return err; +} + +static int +virtio_transport_recv_connected(struct sock *sk, + struct virtio_vsock_pkt *pkt) +{ + struct vsock_sock *vsk = vsock_sk(sk); + struct virtio_transport *trans = vsk->trans; + int err = 0; + + switch (le16_to_cpu(pkt->hdr.op)) { + case VIRTIO_VSOCK_OP_RW: + pkt->len = le32_to_cpu(pkt->hdr.len); + pkt->off = 0; + pkt->trans = trans; + + mutex_lock(&trans->rx_lock); + virtio_transport_inc_rx_pkt(pkt); + list_add_tail(&pkt->list, &trans->rx_queue); + mutex_unlock(&trans->rx_lock); + + sk->sk_data_ready(sk); + return err; + case VIRTIO_VSOCK_OP_CREDIT_UPDATE: + sk->sk_write_space(sk); + break; + case VIRTIO_VSOCK_OP_SHUTDOWN: + pr_debug("%s: got shutdown\n", __func__); + if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SHUTDOWN_RCV) + vsk->peer_shutdown |= RCV_SHUTDOWN; + if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SHUTDOWN_SEND) + vsk->peer_shutdown |= SEND_SHUTDOWN; + if (le32_to_cpu(pkt->hdr.flags)) + sk->sk_state_change(sk); + break; + case VIRTIO_VSOCK_OP_RST: + pr_debug("%s: got rst\n", __func__); + sock_set_flag(sk, SOCK_DONE); + vsk->peer_shutdown = SHUTDOWN_MASK; + if (vsock_stream_has_data(vsk) <= 0) + sk->sk_state = SS_DISCONNECTING; + sk->sk_state_change(sk); + break; + default: + err = -EINVAL; + break; + } + + virtio_transport_free_pkt(pkt); + return err; +} + +static int +virtio_transport_send_response(struct vsock_sock *vsk, + struct virtio_vsock_pkt *pkt) +{ + struct virtio_transport *trans = vsk->trans; + struct virtio_vsock_pkt_info info = { + .op = VIRTIO_VSOCK_OP_RESPONSE, + .type = VIRTIO_VSOCK_TYPE_STREAM, + .remote_cid = le32_to_cpu(pkt->hdr.src_cid), + .remote_port = le32_to_cpu(pkt->hdr.src_port), + }; + u32 cookie; + + cookie = virtio_vsock_secure_cookie(le32_to_cpu(pkt->hdr.src_cid), + le32_to_cpu(pkt->hdr.dst_cid), + le32_to_cpu(pkt->hdr.src_port), + le32_to_cpu(pkt->hdr.dst_port), + jiffies / (HZ * 60)); + info.flags = cpu_to_le32(cookie); + + pr_debug("%s: send_response, cookie=%x\n", __func__, le32_to_cpu(cookie)); + + return trans->ops->send_pkt(vsk, &info); +} + +/* Handle server socket */ +static int +virtio_transport_recv_listen(struct 
sock *sk, struct virtio_vsock_pkt *pkt) +{ + struct vsock_sock *vsk = vsock_sk(sk); + struct vsock_sock *vpending; + struct sock *pending; + int err; + u32 cookie; + + switch (le16_to_cpu(pkt->hdr.op)) { + case VIRTIO_VSOCK_OP_REQUEST: + err = virtio_transport_send_response(vsk, pkt); + if (err < 0) { + /* FIXME: vsk should be vpending */ + virtio_transport_send_reset(vsk, pkt); + return err; + } + break; + case VIRTIO_VSOCK_OP_ACK: + cookie = le32_to_cpu(pkt->hdr.flags); + err = virtio_vsock_check_cookie(le32_to_cpu(pkt->hdr.src_cid), + le32_to_cpu(pkt->hdr.dst_cid), + le32_to_cpu(pkt->hdr.src_port), + le32_to_cpu(pkt->hdr.dst_port), + jiffies / (HZ * 60), + le32_to_cpu(pkt->hdr.flags), + VSOCK_TIMEOUT_INIT); + pr_debug("%s: cookie=%x, err=%d\n", __func__, cookie, err); + if (err) + return err; + + /* No pending socket is responsible for this pkt yet, so create one */ + pr_debug("%s: create pending\n", __func__); + pending = __vsock_create(sock_net(sk), NULL, sk, GFP_KERNEL, + sk->sk_type, 0); + if (!pending) { + virtio_transport_send_reset(vsk, pkt); + return -ENOMEM; + } + sk->sk_ack_backlog++; + pending->sk_state = SS_CONNECTING; + + vpending = vsock_sk(pending); + vsock_addr_init(&vpending->local_addr, le32_to_cpu(pkt->hdr.dst_cid), + le32_to_cpu(pkt->hdr.dst_port)); + vsock_addr_init(&vpending->remote_addr, le32_to_cpu(pkt->hdr.src_cid), + le32_to_cpu(pkt->hdr.src_port)); + vsock_add_pending(sk, pending); + + pr_debug("%s: get pending\n", __func__); + pending = virtio_transport_get_pending(sk, pkt); + vpending = vsock_sk(pending); + lock_sock(pending); + switch (pending->sk_state) { + case SS_CONNECTING: + if (le16_to_cpu(pkt->hdr.op) != VIRTIO_VSOCK_OP_ACK) { + pr_debug("%s: op=%d != OP_ACK\n", __func__, + le16_to_cpu(pkt->hdr.op)); + virtio_transport_send_reset(vpending, pkt); + pending->sk_err = EPROTO; + pending->sk_state = SS_UNCONNECTED; + sock_put(pending); + } else { + pending->sk_state = SS_CONNECTED; + vsock_insert_connected(vpending); + + vsock_remove_pending(sk, pending); + vsock_enqueue_accept(sk, pending); + + sk->sk_data_ready(sk); + } + err = 0; + break; + default: + pr_debug("%s: sk->sk_ack_backlog=%d\n", __func__, + sk->sk_ack_backlog); + virtio_transport_send_reset(vpending, pkt); + err = -EINVAL; + break; + } + if (err < 0) + vsock_remove_pending(sk, pending); + release_sock(pending); + + /* Release refcnt obtained in virtio_transport_get_pending */ + sock_put(pending); + break; + default: + break; + } + + return 0; +} + +static void virtio_transport_space_update(struct sock *sk, + struct virtio_vsock_pkt *pkt) +{ + struct vsock_sock *vsk = vsock_sk(sk); + struct virtio_transport *trans = vsk->trans; + bool space_available; + + /* buf_alloc and fwd_cnt are always included in the hdr */ + mutex_lock(&trans->tx_lock); + trans->peer_buf_alloc = le32_to_cpu(pkt->hdr.buf_alloc); + trans->peer_fwd_cnt = le32_to_cpu(pkt->hdr.fwd_cnt); + space_available = virtio_transport_has_space(vsk); + mutex_unlock(&trans->tx_lock); + + if (space_available) + sk->sk_write_space(sk); +} + +/* We are under the virtio-vsock's vsock->rx_lock or + * vhost-vsock's vq->mutex lock */ +void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt) +{ + struct virtio_transport *trans; + struct sockaddr_vm src, dst; + struct vsock_sock *vsk; + struct sock *sk; + + vsock_addr_init(&src, le32_to_cpu(pkt->hdr.src_cid), le32_to_cpu(pkt->hdr.src_port)); + vsock_addr_init(&dst, le32_to_cpu(pkt->hdr.dst_cid), le32_to_cpu(pkt->hdr.dst_port)); + + virtio_vsock_dumppkt(__func__, pkt); + + if
(le16_to_cpu(pkt->hdr.type) == VIRTIO_VSOCK_TYPE_DGRAM) { + sk = vsock_find_unbound_socket(&dst); + if (!sk) + goto free_pkt; + + vsk = vsock_sk(sk); + trans = vsk->trans; + BUG_ON(!trans); + + virtio_transport_space_update(sk, pkt); + + lock_sock(sk); + switch (le16_to_cpu(pkt->hdr.op)) { + case VIRTIO_VSOCK_OP_CREDIT_UPDATE: + virtio_transport_free_pkt(pkt); + break; + case VIRTIO_VSOCK_OP_CREDIT_REQUEST: + virtio_transport_send_credit_update(vsk, VIRTIO_VSOCK_TYPE_DGRAM, + &pkt->hdr); + virtio_transport_free_pkt(pkt); + break; + case VIRTIO_VSOCK_OP_RW: + virtio_transport_recv_dgram(sk, pkt); + break; + default: + virtio_transport_free_pkt(pkt); + break; + } + release_sock(sk); + + /* Release refcnt obtained when we fetched this socket out of + * the unbound list. + */ + sock_put(sk); + return; + } else if (le16_to_cpu(pkt->hdr.type) == VIRTIO_VSOCK_TYPE_STREAM) { + /* The socket must be in connected or bound table + * otherwise send reset back + */ + sk = vsock_find_connected_socket(&src, &dst); + if (!sk) { + sk = vsock_find_bound_socket(&dst); + if (!sk) { + pr_debug("%s: can not find bound_socket\n", __func__); + virtio_vsock_dumppkt(__func__, pkt); + /* Ignore this pkt instead of sending reset back */ + /* TODO send a RST unless this packet is a RST (to avoid infinite loops) */ + goto free_pkt; + } + } + + vsk = vsock_sk(sk); + trans = vsk->trans; + BUG_ON(!trans); + + virtio_transport_space_update(sk, pkt); + + lock_sock(sk); + switch (sk->sk_state) { + case VSOCK_SS_LISTEN: + virtio_transport_recv_listen(sk, pkt); + virtio_transport_free_pkt(pkt); + break; + case SS_CONNECTING: + virtio_transport_recv_connecting(sk, pkt); + virtio_transport_free_pkt(pkt); + break; + case SS_CONNECTED: + virtio_transport_recv_connected(sk, pkt); + break; + default: + virtio_transport_free_pkt(pkt); + break; + } + release_sock(sk); + + /* Release refcnt obtained when we fetched this socket out of the + * bound or connected list. + */ + sock_put(sk); + } + return; + +free_pkt: + virtio_transport_free_pkt(pkt); +} +EXPORT_SYMBOL_GPL(virtio_transport_recv_pkt); + +void virtio_transport_free_pkt(struct virtio_vsock_pkt *pkt) +{ + kfree(pkt->buf); + kfree(pkt); +} +EXPORT_SYMBOL_GPL(virtio_transport_free_pkt); + +static int __init virtio_vsock_common_init(void) +{ + get_random_bytes(vsockcookie_secret, sizeof(vsockcookie_secret)); + return 0; +} + +static void __exit virtio_vsock_common_exit(void) +{ +} + +module_init(virtio_vsock_common_init); +module_exit(virtio_vsock_common_exit); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Asias He"); +MODULE_DESCRIPTION("common code for virtio vsock"); -- cgit v1.2.3 From cf595922b9a3b61e308224fd29909f54ecb557d4 Mon Sep 17 00:00:00 2001 From: Luca Coelho Date: Mon, 9 Nov 2015 12:17:37 +0200 Subject: nl80211: clarify NL80211_ATTR_SCHED_SCAN_DELAY usage with net-detect In this attribute's documentation, it was not clear whether the delay started counting when WoWLAN net-detect was enabled or when the system was suspended. The correct answer is that it starts when the system suspends (which is when, in practice, the scan is scheduled). Clarify that in the nl80211.h documentation. 
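For illustration only (not part of this change): a minimal libnl-3 sketch that requests a 30 second delay when starting a scheduled scan. The helper name and ifindex parameter are hypothetical, and scan plan/match attributes as well as all error handling and cleanup are elided.

#include <netlink/genl/genl.h>
#include <netlink/genl/ctrl.h>
#include <linux/nl80211.h>

/* Hypothetical helper: NL80211_ATTR_SCHED_SCAN_DELAY is a u32 in seconds;
 * with WoWLAN net-detect it counts from the moment the system suspends. */
static int start_sched_scan_delayed(struct nl_sock *sock, int ifindex)
{
	int family = genl_ctrl_resolve(sock, "nl80211");
	struct nl_msg *msg = nlmsg_alloc();

	genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, family, 0, 0,
		    NL80211_CMD_START_SCHED_SCAN, 0);
	nla_put_u32(msg, NL80211_ATTR_IFINDEX, ifindex);
	nla_put_u32(msg, NL80211_ATTR_SCHED_SCAN_DELAY, 30);
	return nl_send_auto(sock, msg);
}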
Suggested-by: Samuel Tan Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 1f0b4cf5dd03..07099cb14778 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1764,8 +1764,9 @@ enum nl80211_commands { * over all channels. * * @NL80211_ATTR_SCHED_SCAN_DELAY: delay before the first cycle of a - * scheduled scan (or a WoWLAN net-detect scan) is started, u32 - * in seconds. + * scheduled scan is started. Or the delay before a WoWLAN + * net-detect scan is started, counting from the moment the + * system is suspended. This value is a u32, in seconds. * @NL80211_ATTR_REG_INDOOR: flag attribute, if set indicates that the device * is operating in an indoor environment. -- cgit v1.2.3 From 91d3ab46730379e89e1e908c6f62fbcadb3d8f08 Mon Sep 17 00:00:00 2001 From: Vidyullatha Kanchanapally Date: Fri, 30 Oct 2015 19:14:49 +0530 Subject: cfg80211: Add support for aborting an ongoing scan Implement new functionality for aborting an ongoing scan. Add NL80211_CMD_ABORT_SCAN to the nl80211 interface. After aborting the scan, driver shall provide the scan status by calling cfg80211_scan_done(). Reviewed-by: Jouni Malinen Signed-off-by: Vidyullatha Kanchanapally Signed-off-by: Sunil Dutt [change command to take wdev instead of netdev so that it can be used on p2p-device scans] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 3 +++ include/uapi/linux/nl80211.h | 6 ++++++ net/wireless/nl80211.c | 26 ++++++++++++++++++++++++++ net/wireless/rdev-ops.h | 8 ++++++++ net/wireless/trace.h | 4 ++++ 5 files changed, 47 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index e568872203a5..9bcaaf7cd15a 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2321,6 +2321,8 @@ struct cfg80211_qos_map { * the driver, and will be valid until passed to cfg80211_scan_done(). * For scan results, call cfg80211_inform_bss(); you can call this outside * the scan/scan_done bracket too. + * @abort_scan: Tell the driver to abort an ongoing scan. The driver shall + * indicate the status of the scan through cfg80211_scan_done(). * * @auth: Request to authenticate with the specified peer * (invoked with the wireless_dev mutex held) @@ -2593,6 +2595,7 @@ struct cfg80211_ops { int (*scan)(struct wiphy *wiphy, struct cfg80211_scan_request *request); + void (*abort_scan)(struct wiphy *wiphy, struct wireless_dev *wdev); int (*auth)(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_auth_request *req); diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 07099cb14778..5b7b5ebe7ca8 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -820,6 +820,10 @@ * as an event to indicate changes for devices with wiphy-specific regdom * management. * + * @NL80211_CMD_ABORT_SCAN: Stop an ongoing scan. Returns -ENOENT if a scan is + * not running. The driver indicates the status of the scan through + * cfg80211_scan_done(). 
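+ * If the scan has already completed but the scan-done notification has
+ * not yet been delivered to userspace, the command returns 0 without
+ * calling the driver.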
+ * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use */ @@ -1006,6 +1010,8 @@ enum nl80211_commands { NL80211_CMD_WIPHY_REG_CHANGE, + NL80211_CMD_ABORT_SCAN, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 41e57d0c4d43..67e7b531db79 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -5997,6 +5997,24 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) return err; } +static int nl80211_abort_scan(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct wireless_dev *wdev = info->user_ptr[1]; + + if (!rdev->ops->abort_scan) + return -EOPNOTSUPP; + + if (rdev->scan_msg) + return 0; + + if (!rdev->scan_req) + return -ENOENT; + + rdev_abort_scan(rdev, wdev); + return 0; +} + static int nl80211_parse_sched_scan_plans(struct wiphy *wiphy, int n_plans, struct cfg80211_sched_scan_request *request, @@ -10944,6 +10962,14 @@ static const struct genl_ops nl80211_ops[] = { .internal_flags = NL80211_FLAG_NEED_WDEV_UP | NL80211_FLAG_NEED_RTNL, }, + { + .cmd = NL80211_CMD_ABORT_SCAN, + .doit = nl80211_abort_scan, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_WDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, { .cmd = NL80211_CMD_GET_SCAN, .policy = nl80211_policy, diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index b8cc594d409d..8ae0c04f9fc7 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -427,6 +427,14 @@ static inline int rdev_scan(struct cfg80211_registered_device *rdev, return ret; } +static inline void rdev_abort_scan(struct cfg80211_registered_device *rdev, + struct wireless_dev *wdev) +{ + trace_rdev_abort_scan(&rdev->wiphy, wdev); + rdev->ops->abort_scan(&rdev->wiphy, wdev); + trace_rdev_return_void(&rdev->wiphy); +} + static inline int rdev_auth(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_auth_request *req) diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 5b9139e53199..09b242b09bed 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -2917,6 +2917,10 @@ TRACE_EVENT(rdev_set_coalesce, WIPHY_PR_ARG, __entry->n_rules) ); +DEFINE_EVENT(wiphy_wdev_evt, rdev_abort_scan, + TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev), + TP_ARGS(wiphy, wdev) +); #endif /* !__RDEV_OPS_TRACE || TRACE_HEADER_MULTI_READ */ #undef TRACE_INCLUDE_PATH -- cgit v1.2.3 From 8ac2837c89c8c0fcad557e4380aeef80580390f9 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Wed, 9 Dec 2015 10:51:12 +0800 Subject: Revert "Merge branch 'vsock-virtio'" This reverts commit 0d76d6e8b2507983a2cae4c09880798079007421 and merge commit c402293bd76fbc93e52ef8c0947ab81eea3ae019, reversing changes made to c89359a42e2a49656451569c382eed63e781153c. The virtio-vsock device specification is not finalized yet. Michael Tsirkin voiced concerned about merging this code when the hardware interface (and possibly the userspace interface) could still change. Signed-off-by: Stefan Hajnoczi Signed-off-by: David S. 
Miller --- drivers/vhost/Kconfig | 4 - drivers/vhost/Kconfig.vsock | 7 - drivers/vhost/Makefile | 4 - drivers/vhost/vsock.c | 630 --------------- drivers/vhost/vsock.h | 4 - include/linux/virtio_vsock.h | 209 ----- include/net/af_vsock.h | 2 - include/uapi/linux/virtio_ids.h | 1 - include/uapi/linux/virtio_vsock.h | 89 --- net/vmw_vsock/Kconfig | 18 - net/vmw_vsock/Makefile | 2 - net/vmw_vsock/af_vsock.c | 70 -- net/vmw_vsock/virtio_transport.c | 466 ----------- net/vmw_vsock/virtio_transport_common.c | 1272 ------------------------------- 14 files changed, 2778 deletions(-) delete mode 100644 drivers/vhost/Kconfig.vsock delete mode 100644 drivers/vhost/vsock.c delete mode 100644 drivers/vhost/vsock.h delete mode 100644 include/linux/virtio_vsock.h delete mode 100644 include/uapi/linux/virtio_vsock.h delete mode 100644 net/vmw_vsock/virtio_transport.c delete mode 100644 net/vmw_vsock/virtio_transport_common.c (limited to 'include/uapi/linux') diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig index 81449bfc8d3b..533eaf04f12f 100644 --- a/drivers/vhost/Kconfig +++ b/drivers/vhost/Kconfig @@ -47,7 +47,3 @@ config VHOST_CROSS_ENDIAN_LEGACY adds some overhead, it is disabled by default. If unsure, say "N". - -if STAGING -source "drivers/vhost/Kconfig.vsock" -endif diff --git a/drivers/vhost/Kconfig.vsock b/drivers/vhost/Kconfig.vsock deleted file mode 100644 index 3491865d3eb9..000000000000 --- a/drivers/vhost/Kconfig.vsock +++ /dev/null @@ -1,7 +0,0 @@ -config VHOST_VSOCK - tristate "vhost virtio-vsock driver" - depends on VSOCKETS && EVENTFD - select VIRTIO_VSOCKETS_COMMON - default n - ---help--- - Say M here to enable the vhost-vsock for virtio-vsock guests diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile index 6b012b986b57..e0441c34db1c 100644 --- a/drivers/vhost/Makefile +++ b/drivers/vhost/Makefile @@ -4,9 +4,5 @@ vhost_net-y := net.o obj-$(CONFIG_VHOST_SCSI) += vhost_scsi.o vhost_scsi-y := scsi.o -obj-$(CONFIG_VHOST_VSOCK) += vhost_vsock.o -vhost_vsock-y := vsock.o - obj-$(CONFIG_VHOST_RING) += vringh.o - obj-$(CONFIG_VHOST) += vhost.o diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c deleted file mode 100644 index 64bcb10bb901..000000000000 --- a/drivers/vhost/vsock.c +++ /dev/null @@ -1,630 +0,0 @@ -/* - * vhost transport for vsock - * - * Copyright (C) 2013-2015 Red Hat, Inc. - * Author: Asias He - * Stefan Hajnoczi - * - * This work is licensed under the terms of the GNU GPL, version 2. - */ -#include -#include -#include -#include -#include -#include - -#include -#include "vhost.h" -#include "vsock.h" - -#define VHOST_VSOCK_DEFAULT_HOST_CID 2 - -static int vhost_transport_socket_init(struct vsock_sock *vsk, - struct vsock_sock *psk); - -enum { - VHOST_VSOCK_FEATURES = VHOST_FEATURES, -}; - -/* Used to track all the vhost_vsock instances on the system. 
*/ -static LIST_HEAD(vhost_vsock_list); -static DEFINE_MUTEX(vhost_vsock_mutex); - -struct vhost_vsock_virtqueue { - struct vhost_virtqueue vq; -}; - -struct vhost_vsock { - /* Vhost device */ - struct vhost_dev dev; - /* Vhost vsock virtqueue*/ - struct vhost_vsock_virtqueue vqs[VSOCK_VQ_MAX]; - /* Link to global vhost_vsock_list*/ - struct list_head list; - /* Head for pkt from host to guest */ - struct list_head send_pkt_list; - /* Work item to send pkt */ - struct vhost_work send_pkt_work; - /* Wait queue for send pkt */ - wait_queue_head_t queue_wait; - /* Used for global tx buf limitation */ - u32 total_tx_buf; - /* Guest contex id this vhost_vsock instance handles */ - u32 guest_cid; -}; - -static u32 vhost_transport_get_local_cid(void) -{ - return VHOST_VSOCK_DEFAULT_HOST_CID; -} - -static struct vhost_vsock *vhost_vsock_get(u32 guest_cid) -{ - struct vhost_vsock *vsock; - - mutex_lock(&vhost_vsock_mutex); - list_for_each_entry(vsock, &vhost_vsock_list, list) { - if (vsock->guest_cid == guest_cid) { - mutex_unlock(&vhost_vsock_mutex); - return vsock; - } - } - mutex_unlock(&vhost_vsock_mutex); - - return NULL; -} - -static void -vhost_transport_do_send_pkt(struct vhost_vsock *vsock, - struct vhost_virtqueue *vq) -{ - bool added = false; - - mutex_lock(&vq->mutex); - vhost_disable_notify(&vsock->dev, vq); - for (;;) { - struct virtio_vsock_pkt *pkt; - struct iov_iter iov_iter; - unsigned out, in; - struct sock *sk; - size_t nbytes; - size_t len; - int head; - - if (list_empty(&vsock->send_pkt_list)) { - vhost_enable_notify(&vsock->dev, vq); - break; - } - - head = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov), - &out, &in, NULL, NULL); - pr_debug("%s: head = %d\n", __func__, head); - if (head < 0) - break; - - if (head == vq->num) { - if (unlikely(vhost_enable_notify(&vsock->dev, vq))) { - vhost_disable_notify(&vsock->dev, vq); - continue; - } - break; - } - - pkt = list_first_entry(&vsock->send_pkt_list, - struct virtio_vsock_pkt, list); - list_del_init(&pkt->list); - - if (out) { - virtio_transport_free_pkt(pkt); - vq_err(vq, "Expected 0 output buffers, got %u\n", out); - break; - } - - len = iov_length(&vq->iov[out], in); - iov_iter_init(&iov_iter, READ, &vq->iov[out], in, len); - - nbytes = copy_to_iter(&pkt->hdr, sizeof(pkt->hdr), &iov_iter); - if (nbytes != sizeof(pkt->hdr)) { - virtio_transport_free_pkt(pkt); - vq_err(vq, "Faulted on copying pkt hdr\n"); - break; - } - - nbytes = copy_to_iter(pkt->buf, pkt->len, &iov_iter); - if (nbytes != pkt->len) { - virtio_transport_free_pkt(pkt); - vq_err(vq, "Faulted on copying pkt buf\n"); - break; - } - - vhost_add_used(vq, head, pkt->len); /* TODO should this be sizeof(pkt->hdr) + pkt->len? 
*/ - added = true; - - virtio_transport_dec_tx_pkt(pkt); - vsock->total_tx_buf -= pkt->len; - - sk = sk_vsock(pkt->trans->vsk); - /* Release refcnt taken in vhost_transport_send_pkt */ - sock_put(sk); - - virtio_transport_free_pkt(pkt); - } - if (added) - vhost_signal(&vsock->dev, vq); - mutex_unlock(&vq->mutex); - - if (added) - wake_up(&vsock->queue_wait); -} - -static void vhost_transport_send_pkt_work(struct vhost_work *work) -{ - struct vhost_virtqueue *vq; - struct vhost_vsock *vsock; - - vsock = container_of(work, struct vhost_vsock, send_pkt_work); - vq = &vsock->vqs[VSOCK_VQ_RX].vq; - - vhost_transport_do_send_pkt(vsock, vq); -} - -static int -vhost_transport_send_pkt(struct vsock_sock *vsk, - struct virtio_vsock_pkt_info *info) -{ - u32 src_cid, src_port, dst_cid, dst_port; - struct virtio_transport *trans; - struct virtio_vsock_pkt *pkt; - struct vhost_virtqueue *vq; - struct vhost_vsock *vsock; - u32 pkt_len = info->pkt_len; - DEFINE_WAIT(wait); - - src_cid = vhost_transport_get_local_cid(); - src_port = vsk->local_addr.svm_port; - if (!info->remote_cid) { - dst_cid = vsk->remote_addr.svm_cid; - dst_port = vsk->remote_addr.svm_port; - } else { - dst_cid = info->remote_cid; - dst_port = info->remote_port; - } - - /* Find the vhost_vsock according to guest context id */ - vsock = vhost_vsock_get(dst_cid); - if (!vsock) - return -ENODEV; - - trans = vsk->trans; - vq = &vsock->vqs[VSOCK_VQ_RX].vq; - - /* we can send less than pkt_len bytes */ - if (pkt_len > VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE) - pkt_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE; - - /* virtio_transport_get_credit might return less than pkt_len credit */ - pkt_len = virtio_transport_get_credit(trans, pkt_len); - - /* Do not send zero length OP_RW pkt*/ - if (pkt_len == 0 && info->op == VIRTIO_VSOCK_OP_RW) - return pkt_len; - - /* Respect global tx buf limitation */ - mutex_lock(&vq->mutex); - while (pkt_len + vsock->total_tx_buf > VIRTIO_VSOCK_MAX_TX_BUF_SIZE) { - prepare_to_wait_exclusive(&vsock->queue_wait, &wait, - TASK_UNINTERRUPTIBLE); - mutex_unlock(&vq->mutex); - schedule(); - mutex_lock(&vq->mutex); - finish_wait(&vsock->queue_wait, &wait); - } - vsock->total_tx_buf += pkt_len; - mutex_unlock(&vq->mutex); - - pkt = virtio_transport_alloc_pkt(vsk, info, pkt_len, - src_cid, src_port, - dst_cid, dst_port); - if (!pkt) { - mutex_lock(&vq->mutex); - vsock->total_tx_buf -= pkt_len; - mutex_unlock(&vq->mutex); - virtio_transport_put_credit(trans, pkt_len); - return -ENOMEM; - } - - pr_debug("%s:info->pkt_len= %d\n", __func__, pkt_len); - /* Released in vhost_transport_do_send_pkt */ - sock_hold(&trans->vsk->sk); - virtio_transport_inc_tx_pkt(pkt); - - /* Queue it up in vhost work */ - mutex_lock(&vq->mutex); - list_add_tail(&pkt->list, &vsock->send_pkt_list); - vhost_work_queue(&vsock->dev, &vsock->send_pkt_work); - mutex_unlock(&vq->mutex); - - return pkt_len; -} - -static struct virtio_transport_pkt_ops vhost_ops = { - .send_pkt = vhost_transport_send_pkt, -}; - -static struct virtio_vsock_pkt * -vhost_vsock_alloc_pkt(struct vhost_virtqueue *vq, - unsigned int out, unsigned int in) -{ - struct virtio_vsock_pkt *pkt; - struct iov_iter iov_iter; - size_t nbytes; - size_t len; - - if (in != 0) { - vq_err(vq, "Expected 0 input buffers, got %u\n", in); - return NULL; - } - - pkt = kzalloc(sizeof(*pkt), GFP_KERNEL); - if (!pkt) - return NULL; - - len = iov_length(vq->iov, out); - iov_iter_init(&iov_iter, WRITE, vq->iov, out, len); - - nbytes = copy_from_iter(&pkt->hdr, sizeof(pkt->hdr), &iov_iter); - if (nbytes != 
sizeof(pkt->hdr)) { - vq_err(vq, "Expected %zu bytes for pkt->hdr, got %zu bytes\n", - sizeof(pkt->hdr), nbytes); - kfree(pkt); - return NULL; - } - - if (le16_to_cpu(pkt->hdr.type) == VIRTIO_VSOCK_TYPE_DGRAM) - pkt->len = le32_to_cpu(pkt->hdr.len) & 0XFFFF; - else if (le16_to_cpu(pkt->hdr.type) == VIRTIO_VSOCK_TYPE_STREAM) - pkt->len = le32_to_cpu(pkt->hdr.len); - - /* No payload */ - if (!pkt->len) - return pkt; - - /* The pkt is too big */ - if (pkt->len > VIRTIO_VSOCK_MAX_PKT_BUF_SIZE) { - kfree(pkt); - return NULL; - } - - pkt->buf = kmalloc(pkt->len, GFP_KERNEL); - if (!pkt->buf) { - kfree(pkt); - return NULL; - } - - nbytes = copy_from_iter(pkt->buf, pkt->len, &iov_iter); - if (nbytes != pkt->len) { - vq_err(vq, "Expected %u byte payload, got %zu bytes\n", - pkt->len, nbytes); - virtio_transport_free_pkt(pkt); - return NULL; - } - - return pkt; -} - -static void vhost_vsock_handle_ctl_kick(struct vhost_work *work) -{ - struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue, - poll.work); - struct vhost_vsock *vsock = container_of(vq->dev, struct vhost_vsock, - dev); - - pr_debug("%s vq=%p, vsock=%p\n", __func__, vq, vsock); -} - -static void vhost_vsock_handle_tx_kick(struct vhost_work *work) -{ - struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue, - poll.work); - struct vhost_vsock *vsock = container_of(vq->dev, struct vhost_vsock, - dev); - struct virtio_vsock_pkt *pkt; - int head; - unsigned int out, in; - bool added = false; - u32 len; - - mutex_lock(&vq->mutex); - vhost_disable_notify(&vsock->dev, vq); - for (;;) { - head = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov), - &out, &in, NULL, NULL); - if (head < 0) - break; - - if (head == vq->num) { - if (unlikely(vhost_enable_notify(&vsock->dev, vq))) { - vhost_disable_notify(&vsock->dev, vq); - continue; - } - break; - } - - pkt = vhost_vsock_alloc_pkt(vq, out, in); - if (!pkt) { - vq_err(vq, "Faulted on pkt\n"); - continue; - } - - len = pkt->len; - - /* Only accept correctly addressed packets */ - if (le32_to_cpu(pkt->hdr.src_cid) == vsock->guest_cid && - le32_to_cpu(pkt->hdr.dst_cid) == vhost_transport_get_local_cid()) - virtio_transport_recv_pkt(pkt); - else - virtio_transport_free_pkt(pkt); - - vhost_add_used(vq, head, len); - added = true; - } - if (added) - vhost_signal(&vsock->dev, vq); - mutex_unlock(&vq->mutex); -} - -static void vhost_vsock_handle_rx_kick(struct vhost_work *work) -{ - struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue, - poll.work); - struct vhost_vsock *vsock = container_of(vq->dev, struct vhost_vsock, - dev); - - vhost_transport_do_send_pkt(vsock, vq); -} - -static int vhost_vsock_dev_open(struct inode *inode, struct file *file) -{ - struct vhost_virtqueue **vqs; - struct vhost_vsock *vsock; - int ret; - - vsock = kzalloc(sizeof(*vsock), GFP_KERNEL); - if (!vsock) - return -ENOMEM; - - pr_debug("%s:vsock=%p\n", __func__, vsock); - - vqs = kmalloc(VSOCK_VQ_MAX * sizeof(*vqs), GFP_KERNEL); - if (!vqs) { - ret = -ENOMEM; - goto out; - } - - vqs[VSOCK_VQ_CTRL] = &vsock->vqs[VSOCK_VQ_CTRL].vq; - vqs[VSOCK_VQ_TX] = &vsock->vqs[VSOCK_VQ_TX].vq; - vqs[VSOCK_VQ_RX] = &vsock->vqs[VSOCK_VQ_RX].vq; - vsock->vqs[VSOCK_VQ_CTRL].vq.handle_kick = vhost_vsock_handle_ctl_kick; - vsock->vqs[VSOCK_VQ_TX].vq.handle_kick = vhost_vsock_handle_tx_kick; - vsock->vqs[VSOCK_VQ_RX].vq.handle_kick = vhost_vsock_handle_rx_kick; - - vhost_dev_init(&vsock->dev, vqs, VSOCK_VQ_MAX); - - file->private_data = vsock; - init_waitqueue_head(&vsock->queue_wait); - 
INIT_LIST_HEAD(&vsock->send_pkt_list); - vhost_work_init(&vsock->send_pkt_work, vhost_transport_send_pkt_work); - - mutex_lock(&vhost_vsock_mutex); - list_add_tail(&vsock->list, &vhost_vsock_list); - mutex_unlock(&vhost_vsock_mutex); - return 0; - -out: - kfree(vsock); - return ret; -} - -static void vhost_vsock_flush(struct vhost_vsock *vsock) -{ - int i; - - for (i = 0; i < VSOCK_VQ_MAX; i++) - vhost_poll_flush(&vsock->vqs[i].vq.poll); - vhost_work_flush(&vsock->dev, &vsock->send_pkt_work); -} - -static int vhost_vsock_dev_release(struct inode *inode, struct file *file) -{ - struct vhost_vsock *vsock = file->private_data; - - mutex_lock(&vhost_vsock_mutex); - list_del(&vsock->list); - mutex_unlock(&vhost_vsock_mutex); - - vhost_dev_stop(&vsock->dev); - vhost_vsock_flush(vsock); - vhost_dev_cleanup(&vsock->dev, false); - kfree(vsock->dev.vqs); - kfree(vsock); - return 0; -} - -static int vhost_vsock_set_cid(struct vhost_vsock *vsock, u32 guest_cid) -{ - struct vhost_vsock *other; - - /* Refuse reserved CIDs */ - if (guest_cid <= VMADDR_CID_HOST) { - return -EINVAL; - } - - /* Refuse if CID is already in use */ - other = vhost_vsock_get(guest_cid); - if (other && other != vsock) { - return -EADDRINUSE; - } - - mutex_lock(&vhost_vsock_mutex); - vsock->guest_cid = guest_cid; - pr_debug("%s:guest_cid=%d\n", __func__, guest_cid); - mutex_unlock(&vhost_vsock_mutex); - - return 0; -} - -static int vhost_vsock_set_features(struct vhost_vsock *vsock, u64 features) -{ - struct vhost_virtqueue *vq; - int i; - - if (features & ~VHOST_VSOCK_FEATURES) - return -EOPNOTSUPP; - - mutex_lock(&vsock->dev.mutex); - if ((features & (1 << VHOST_F_LOG_ALL)) && - !vhost_log_access_ok(&vsock->dev)) { - mutex_unlock(&vsock->dev.mutex); - return -EFAULT; - } - - for (i = 0; i < VSOCK_VQ_MAX; i++) { - vq = &vsock->vqs[i].vq; - mutex_lock(&vq->mutex); - vq->acked_features = features; - mutex_unlock(&vq->mutex); - } - mutex_unlock(&vsock->dev.mutex); - return 0; -} - -static long vhost_vsock_dev_ioctl(struct file *f, unsigned int ioctl, - unsigned long arg) -{ - struct vhost_vsock *vsock = f->private_data; - void __user *argp = (void __user *)arg; - u64 __user *featurep = argp; - u32 __user *cidp = argp; - u32 guest_cid; - u64 features; - int r; - - switch (ioctl) { - case VHOST_VSOCK_SET_GUEST_CID: - if (get_user(guest_cid, cidp)) - return -EFAULT; - return vhost_vsock_set_cid(vsock, guest_cid); - case VHOST_GET_FEATURES: - features = VHOST_VSOCK_FEATURES; - if (copy_to_user(featurep, &features, sizeof(features))) - return -EFAULT; - return 0; - case VHOST_SET_FEATURES: - if (copy_from_user(&features, featurep, sizeof(features))) - return -EFAULT; - return vhost_vsock_set_features(vsock, features); - default: - mutex_lock(&vsock->dev.mutex); - r = vhost_dev_ioctl(&vsock->dev, ioctl, argp); - if (r == -ENOIOCTLCMD) - r = vhost_vring_ioctl(&vsock->dev, ioctl, argp); - else - vhost_vsock_flush(vsock); - mutex_unlock(&vsock->dev.mutex); - return r; - } -} - -static const struct file_operations vhost_vsock_fops = { - .owner = THIS_MODULE, - .open = vhost_vsock_dev_open, - .release = vhost_vsock_dev_release, - .llseek = noop_llseek, - .unlocked_ioctl = vhost_vsock_dev_ioctl, -}; - -static struct miscdevice vhost_vsock_misc = { - .minor = MISC_DYNAMIC_MINOR, - .name = "vhost-vsock", - .fops = &vhost_vsock_fops, -}; - -static int -vhost_transport_socket_init(struct vsock_sock *vsk, struct vsock_sock *psk) -{ - struct virtio_transport *trans; - int ret; - - ret = virtio_transport_do_socket_init(vsk, psk); - if (ret) - return 
ret; - - trans = vsk->trans; - trans->ops = &vhost_ops; - - return ret; -} - -static struct vsock_transport vhost_transport = { - .get_local_cid = vhost_transport_get_local_cid, - - .init = vhost_transport_socket_init, - .destruct = virtio_transport_destruct, - .release = virtio_transport_release, - .connect = virtio_transport_connect, - .shutdown = virtio_transport_shutdown, - - .dgram_enqueue = virtio_transport_dgram_enqueue, - .dgram_dequeue = virtio_transport_dgram_dequeue, - .dgram_bind = virtio_transport_dgram_bind, - .dgram_allow = virtio_transport_dgram_allow, - - .stream_enqueue = virtio_transport_stream_enqueue, - .stream_dequeue = virtio_transport_stream_dequeue, - .stream_has_data = virtio_transport_stream_has_data, - .stream_has_space = virtio_transport_stream_has_space, - .stream_rcvhiwat = virtio_transport_stream_rcvhiwat, - .stream_is_active = virtio_transport_stream_is_active, - .stream_allow = virtio_transport_stream_allow, - - .notify_poll_in = virtio_transport_notify_poll_in, - .notify_poll_out = virtio_transport_notify_poll_out, - .notify_recv_init = virtio_transport_notify_recv_init, - .notify_recv_pre_block = virtio_transport_notify_recv_pre_block, - .notify_recv_pre_dequeue = virtio_transport_notify_recv_pre_dequeue, - .notify_recv_post_dequeue = virtio_transport_notify_recv_post_dequeue, - .notify_send_init = virtio_transport_notify_send_init, - .notify_send_pre_block = virtio_transport_notify_send_pre_block, - .notify_send_pre_enqueue = virtio_transport_notify_send_pre_enqueue, - .notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue, - - .set_buffer_size = virtio_transport_set_buffer_size, - .set_min_buffer_size = virtio_transport_set_min_buffer_size, - .set_max_buffer_size = virtio_transport_set_max_buffer_size, - .get_buffer_size = virtio_transport_get_buffer_size, - .get_min_buffer_size = virtio_transport_get_min_buffer_size, - .get_max_buffer_size = virtio_transport_get_max_buffer_size, -}; - -static int __init vhost_vsock_init(void) -{ - int ret; - - ret = vsock_core_init(&vhost_transport); - if (ret < 0) - return ret; - return misc_register(&vhost_vsock_misc); -}; - -static void __exit vhost_vsock_exit(void) -{ - misc_deregister(&vhost_vsock_misc); - vsock_core_exit(); -}; - -module_init(vhost_vsock_init); -module_exit(vhost_vsock_exit); -MODULE_LICENSE("GPL v2"); -MODULE_AUTHOR("Asias He"); -MODULE_DESCRIPTION("vhost transport for vsock "); diff --git a/drivers/vhost/vsock.h b/drivers/vhost/vsock.h deleted file mode 100644 index 0ddb107b86ca..000000000000 --- a/drivers/vhost/vsock.h +++ /dev/null @@ -1,4 +0,0 @@ -#ifndef VHOST_VSOCK_H -#define VHOST_VSOCK_H -#define VHOST_VSOCK_SET_GUEST_CID _IOW(VHOST_VIRTIO, 0x60, __u32) -#endif diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h deleted file mode 100644 index a5f3ecc038f7..000000000000 --- a/include/linux/virtio_vsock.h +++ /dev/null @@ -1,209 +0,0 @@ -/* - * This header, excluding the #ifdef __KERNEL__ part, is BSD licensed so - * anyone can use the definitions to implement compatible drivers/servers: - * - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. 
Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of IBM nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * Copyright (C) Red Hat, Inc., 2013-2015 - * Copyright (C) Asias He , 2013 - * Copyright (C) Stefan Hajnoczi , 2015 - */ - -#ifndef _LINUX_VIRTIO_VSOCK_H -#define _LINUX_VIRTIO_VSOCK_H - -#include -#include -#include - -#define VIRTIO_VSOCK_DEFAULT_MIN_BUF_SIZE 128 -#define VIRTIO_VSOCK_DEFAULT_BUF_SIZE (1024 * 256) -#define VIRTIO_VSOCK_DEFAULT_MAX_BUF_SIZE (1024 * 256) -#define VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE (1024 * 4) -#define VIRTIO_VSOCK_MAX_BUF_SIZE 0xFFFFFFFFUL -#define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE (1024 * 64) -#define VIRTIO_VSOCK_MAX_TX_BUF_SIZE (1024 * 1024 * 16) -#define VIRTIO_VSOCK_MAX_DGRAM_SIZE (1024 * 64) - -struct vsock_transport_recv_notify_data; -struct vsock_transport_send_notify_data; -struct sockaddr_vm; -struct vsock_sock; - -enum { - VSOCK_VQ_CTRL = 0, - VSOCK_VQ_RX = 1, /* for host to guest data */ - VSOCK_VQ_TX = 2, /* for guest to host data */ - VSOCK_VQ_MAX = 3, -}; - -/* virtio transport socket state */ -struct virtio_transport { - struct virtio_transport_pkt_ops *ops; - struct vsock_sock *vsk; - - u32 buf_size; - u32 buf_size_min; - u32 buf_size_max; - - struct mutex tx_lock; - struct mutex rx_lock; - - struct list_head rx_queue; - u32 rx_bytes; - - /* Protected by trans->tx_lock */ - u32 tx_cnt; - u32 buf_alloc; - u32 peer_fwd_cnt; - u32 peer_buf_alloc; - /* Protected by trans->rx_lock */ - u32 fwd_cnt; - - /* Protected by sk_lock */ - u16 dgram_id; - struct list_head incomplete_dgrams; /* dgram fragments */ -}; - -struct virtio_vsock_pkt { - struct virtio_vsock_hdr hdr; - struct virtio_transport *trans; - struct work_struct work; - struct list_head list; - void *buf; - u32 len; - u32 off; -}; - -struct virtio_vsock_pkt_info { - u32 remote_cid, remote_port; - struct msghdr *msg; - u32 pkt_len; - u16 type; - u16 op; - u32 flags; - u16 dgram_id; - u16 dgram_len; -}; - -struct virtio_transport_pkt_ops { - int (*send_pkt)(struct vsock_sock *vsk, - struct virtio_vsock_pkt_info *info); -}; - -void virtio_vsock_dumppkt(const char *func, - const struct virtio_vsock_pkt *pkt); - -struct sock * -virtio_transport_get_pending(struct sock *listener, - struct virtio_vsock_pkt *pkt); -struct virtio_vsock_pkt * -virtio_transport_alloc_pkt(struct vsock_sock *vsk, - struct virtio_vsock_pkt_info *info, - size_t len, - u32 src_cid, - u32 src_port, - u32 dst_cid, - u32 dst_port); -ssize_t 
-virtio_transport_stream_dequeue(struct vsock_sock *vsk, - struct msghdr *msg, - size_t len, - int type); -int -virtio_transport_dgram_dequeue(struct vsock_sock *vsk, - struct msghdr *msg, - size_t len, int flags); - -s64 virtio_transport_stream_has_data(struct vsock_sock *vsk); -s64 virtio_transport_stream_has_space(struct vsock_sock *vsk); - -int virtio_transport_do_socket_init(struct vsock_sock *vsk, - struct vsock_sock *psk); -u64 virtio_transport_get_buffer_size(struct vsock_sock *vsk); -u64 virtio_transport_get_min_buffer_size(struct vsock_sock *vsk); -u64 virtio_transport_get_max_buffer_size(struct vsock_sock *vsk); -void virtio_transport_set_buffer_size(struct vsock_sock *vsk, u64 val); -void virtio_transport_set_min_buffer_size(struct vsock_sock *vsk, u64 val); -void virtio_transport_set_max_buffer_size(struct vsock_sock *vs, u64 val); -int -virtio_transport_notify_poll_in(struct vsock_sock *vsk, - size_t target, - bool *data_ready_now); -int -virtio_transport_notify_poll_out(struct vsock_sock *vsk, - size_t target, - bool *space_available_now); - -int virtio_transport_notify_recv_init(struct vsock_sock *vsk, - size_t target, struct vsock_transport_recv_notify_data *data); -int virtio_transport_notify_recv_pre_block(struct vsock_sock *vsk, - size_t target, struct vsock_transport_recv_notify_data *data); -int virtio_transport_notify_recv_pre_dequeue(struct vsock_sock *vsk, - size_t target, struct vsock_transport_recv_notify_data *data); -int virtio_transport_notify_recv_post_dequeue(struct vsock_sock *vsk, - size_t target, ssize_t copied, bool data_read, - struct vsock_transport_recv_notify_data *data); -int virtio_transport_notify_send_init(struct vsock_sock *vsk, - struct vsock_transport_send_notify_data *data); -int virtio_transport_notify_send_pre_block(struct vsock_sock *vsk, - struct vsock_transport_send_notify_data *data); -int virtio_transport_notify_send_pre_enqueue(struct vsock_sock *vsk, - struct vsock_transport_send_notify_data *data); -int virtio_transport_notify_send_post_enqueue(struct vsock_sock *vsk, - ssize_t written, struct vsock_transport_send_notify_data *data); - -u64 virtio_transport_stream_rcvhiwat(struct vsock_sock *vsk); -bool virtio_transport_stream_is_active(struct vsock_sock *vsk); -bool virtio_transport_stream_allow(u32 cid, u32 port); -int virtio_transport_dgram_bind(struct vsock_sock *vsk, - struct sockaddr_vm *addr); -bool virtio_transport_dgram_allow(u32 cid, u32 port); - -int virtio_transport_connect(struct vsock_sock *vsk); - -int virtio_transport_shutdown(struct vsock_sock *vsk, int mode); - -void virtio_transport_release(struct vsock_sock *vsk); - -ssize_t -virtio_transport_stream_enqueue(struct vsock_sock *vsk, - struct msghdr *msg, - size_t len); -int -virtio_transport_dgram_enqueue(struct vsock_sock *vsk, - struct sockaddr_vm *remote_addr, - struct msghdr *msg, - size_t len); - -void virtio_transport_destruct(struct vsock_sock *vsk); - -void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt); -void virtio_transport_free_pkt(struct virtio_vsock_pkt *pkt); -void virtio_transport_inc_tx_pkt(struct virtio_vsock_pkt *pkt); -void virtio_transport_dec_tx_pkt(struct virtio_vsock_pkt *pkt); -u32 virtio_transport_get_credit(struct virtio_transport *trans, u32 wanted); -void virtio_transport_put_credit(struct virtio_transport *trans, u32 credit); -#endif /* _LINUX_VIRTIO_VSOCK_H */ diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index a0c8fa2ababf..e9eb2d6791b3 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ 
-175,10 +175,8 @@ void vsock_insert_connected(struct vsock_sock *vsk); void vsock_remove_bound(struct vsock_sock *vsk); void vsock_remove_connected(struct vsock_sock *vsk); struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr); -struct sock *vsock_find_unbound_socket(struct sockaddr_vm *addr); struct sock *vsock_find_connected_socket(struct sockaddr_vm *src, struct sockaddr_vm *dst); void vsock_for_each_connected_socket(void (*fn)(struct sock *sk)); -int vsock_bind_dgram_generic(struct vsock_sock *vsk, struct sockaddr_vm *addr); #endif /* __AF_VSOCK_H__ */ diff --git a/include/uapi/linux/virtio_ids.h b/include/uapi/linux/virtio_ids.h index 16dcf5d06cd7..77925f587b15 100644 --- a/include/uapi/linux/virtio_ids.h +++ b/include/uapi/linux/virtio_ids.h @@ -39,7 +39,6 @@ #define VIRTIO_ID_9P 9 /* 9p virtio console */ #define VIRTIO_ID_RPROC_SERIAL 11 /* virtio remoteproc serial link */ #define VIRTIO_ID_CAIF 12 /* Virtio caif */ -#define VIRTIO_ID_VSOCK 13 /* virtio vsock transport */ #define VIRTIO_ID_GPU 16 /* virtio GPU */ #define VIRTIO_ID_INPUT 18 /* virtio input */ diff --git a/include/uapi/linux/virtio_vsock.h b/include/uapi/linux/virtio_vsock.h deleted file mode 100644 index 8cf9b5682628..000000000000 --- a/include/uapi/linux/virtio_vsock.h +++ /dev/null @@ -1,89 +0,0 @@ -/* - * This header, excluding the #ifdef __KERNEL__ part, is BSD licensed so - * anyone can use the definitions to implement compatible drivers/servers: - * - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of IBM nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- * - * Copyright (C) Red Hat, Inc., 2013-2015 - * Copyright (C) Asias He , 2013 - * Copyright (C) Stefan Hajnoczi , 2015 - */ - -#ifndef _UAPI_LINUX_VIRTIO_VSOCK_H -#define _UAPI_LINUX_VIRTIO_VSOCK_H - -#include -#include -#include - -struct virtio_vsock_config { - __le32 guest_cid; - __le32 max_virtqueue_pairs; -}; - -struct virtio_vsock_hdr { - __le32 src_cid; - __le32 src_port; - __le32 dst_cid; - __le32 dst_port; - __le32 len; - __le16 type; /* enum virtio_vsock_type */ - __le16 op; /* enum virtio_vsock_op */ - __le32 flags; - __le32 buf_alloc; - __le32 fwd_cnt; -}; - -enum virtio_vsock_type { - VIRTIO_VSOCK_TYPE_STREAM = 1, - VIRTIO_VSOCK_TYPE_DGRAM = 2, -}; - -enum virtio_vsock_op { - VIRTIO_VSOCK_OP_INVALID = 0, - - /* Connect operations */ - VIRTIO_VSOCK_OP_REQUEST = 1, - VIRTIO_VSOCK_OP_RESPONSE = 2, - VIRTIO_VSOCK_OP_ACK = 3, - VIRTIO_VSOCK_OP_RST = 4, - VIRTIO_VSOCK_OP_SHUTDOWN = 5, - - /* To send payload */ - VIRTIO_VSOCK_OP_RW = 6, - - /* Tell the peer our credit info */ - VIRTIO_VSOCK_OP_CREDIT_UPDATE = 7, - /* Request the peer to send the credit info to us */ - VIRTIO_VSOCK_OP_CREDIT_REQUEST = 8, -}; - -/* VIRTIO_VSOCK_OP_SHUTDOWN flags values */ -enum virtio_vsock_shutdown { - VIRTIO_VSOCK_SHUTDOWN_RCV = 1, - VIRTIO_VSOCK_SHUTDOWN_SEND = 2, -}; - -#endif /* _UAPI_LINUX_VIRTIO_VSOCK_H */ diff --git a/net/vmw_vsock/Kconfig b/net/vmw_vsock/Kconfig index 74e0bc887a33..14810abedc2e 100644 --- a/net/vmw_vsock/Kconfig +++ b/net/vmw_vsock/Kconfig @@ -26,21 +26,3 @@ config VMWARE_VMCI_VSOCKETS To compile this driver as a module, choose M here: the module will be called vmw_vsock_vmci_transport. If unsure, say N. - -config VIRTIO_VSOCKETS - tristate "virtio transport for Virtual Sockets" - depends on VSOCKETS && VIRTIO - select VIRTIO_VSOCKETS_COMMON - help - This module implements a virtio transport for Virtual Sockets. - - Enable this transport if your Virtual Machine runs on Qemu/KVM. - - To compile this driver as a module, choose M here: the module - will be called virtio_vsock_transport. If unsure, say N. - -config VIRTIO_VSOCKETS_COMMON - tristate - ---help--- - This option is selected by any driver which needs to access - the virtio_vsock.
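For context while reading this revert: the UAPI header deleted above defined the vsock wire format. Every packet begins with a virtio_vsock_hdr; type selects stream or datagram semantics, op selects the operation, and buf_alloc/fwd_cnt piggyback credit accounting on every packet. As a minimal sketch, assuming only the struct and enums removed above (fill_shutdown_hdr() is a hypothetical helper, not kernel API), a sender would populate the header for a full-duplex shutdown like this:

/* Hypothetical illustration of the removed wire format, not kernel code. */
static void fill_shutdown_hdr(struct virtio_vsock_hdr *hdr,
			      u32 src_cid, u32 src_port,
			      u32 dst_cid, u32 dst_port)
{
	hdr->src_cid	= cpu_to_le32(src_cid);
	hdr->src_port	= cpu_to_le32(src_port);
	hdr->dst_cid	= cpu_to_le32(dst_cid);
	hdr->dst_port	= cpu_to_le32(dst_port);
	hdr->len	= cpu_to_le32(0);	/* control packet, no payload */
	hdr->type	= cpu_to_le16(VIRTIO_VSOCK_TYPE_STREAM);
	hdr->op		= cpu_to_le16(VIRTIO_VSOCK_OP_SHUTDOWN);
	/* For OP_SHUTDOWN, flags carries the shutdown direction bits. */
	hdr->flags	= cpu_to_le32(VIRTIO_VSOCK_SHUTDOWN_RCV |
				      VIRTIO_VSOCK_SHUTDOWN_SEND);
	hdr->buf_alloc	= cpu_to_le32(0);
	hdr->fwd_cnt	= cpu_to_le32(0);
}

All multi-byte fields are little-endian on the wire, which is why every store goes through cpu_to_le32()/cpu_to_le16().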
diff --git a/net/vmw_vsock/Makefile b/net/vmw_vsock/Makefile index cf4c29439081..2ce52d70f224 100644 --- a/net/vmw_vsock/Makefile +++ b/net/vmw_vsock/Makefile @@ -1,7 +1,5 @@ obj-$(CONFIG_VSOCKETS) += vsock.o obj-$(CONFIG_VMWARE_VMCI_VSOCKETS) += vmw_vsock_vmci_transport.o -obj-$(CONFIG_VIRTIO_VSOCKETS) += virtio_transport.o -obj-$(CONFIG_VIRTIO_VSOCKETS_COMMON) += virtio_transport_common.o vsock-y += af_vsock.o vsock_addr.o diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 77247a2b670b..7fd1220fbfa0 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -223,17 +223,6 @@ static struct sock *__vsock_find_bound_socket(struct sockaddr_vm *addr) return NULL; } -static struct sock *__vsock_find_unbound_socket(struct sockaddr_vm *addr) -{ - struct vsock_sock *vsk; - - list_for_each_entry(vsk, vsock_unbound_sockets, bound_table) - if (addr->svm_port == vsk->local_addr.svm_port) - return sk_vsock(vsk); - - return NULL; -} - static struct sock *__vsock_find_connected_socket(struct sockaddr_vm *src, struct sockaddr_vm *dst) { @@ -309,21 +298,6 @@ struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr) } EXPORT_SYMBOL_GPL(vsock_find_bound_socket); -struct sock *vsock_find_unbound_socket(struct sockaddr_vm *addr) -{ - struct sock *sk; - - spin_lock_bh(&vsock_table_lock); - sk = __vsock_find_unbound_socket(addr); - if (sk) - sock_hold(sk); - - spin_unlock_bh(&vsock_table_lock); - - return sk; -} -EXPORT_SYMBOL_GPL(vsock_find_unbound_socket); - struct sock *vsock_find_connected_socket(struct sockaddr_vm *src, struct sockaddr_vm *dst) { @@ -558,50 +532,6 @@ static int __vsock_bind_stream(struct vsock_sock *vsk, return 0; } -int vsock_bind_dgram_generic(struct vsock_sock *vsk, struct sockaddr_vm *addr) -{ - static u32 port = LAST_RESERVED_PORT + 1; - struct sockaddr_vm new_addr; - - vsock_addr_init(&new_addr, addr->svm_cid, addr->svm_port); - - if (addr->svm_port == VMADDR_PORT_ANY) { - bool found = false; - unsigned int i; - - for (i = 0; i < MAX_PORT_RETRIES; i++) { - if (port <= LAST_RESERVED_PORT) - port = LAST_RESERVED_PORT + 1; - - new_addr.svm_port = port++; - - if (!__vsock_find_unbound_socket(&new_addr)) { - found = true; - break; - } - } - - if (!found) - return -EADDRNOTAVAIL; - } else { - /* If port is in reserved range, ensure caller - * has necessary privileges. - */ - if (addr->svm_port <= LAST_RESERVED_PORT && - !capable(CAP_NET_BIND_SERVICE)) { - return -EACCES; - } - - if (__vsock_find_unbound_socket(&new_addr)) - return -EADDRINUSE; - } - - vsock_addr_init(&vsk->local_addr, new_addr.svm_cid, new_addr.svm_port); - - return 0; -} -EXPORT_SYMBOL_GPL(vsock_bind_dgram_generic); - static int __vsock_bind_dgram(struct vsock_sock *vsk, struct sockaddr_vm *addr) { diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c deleted file mode 100644 index df65dca55fa1..000000000000 --- a/net/vmw_vsock/virtio_transport.c +++ /dev/null @@ -1,466 +0,0 @@ -/* - * virtio transport for vsock - * - * Copyright (C) 2013-2015 Red Hat, Inc. - * Author: Asias He - * Stefan Hajnoczi - * - * Some of the code is taken from Gerd Hoffmann 's - * early virtio-vsock proof-of-concept bits. - * - * This work is licensed under the terms of the GNU GPL, version 2.
- */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static struct workqueue_struct *virtio_vsock_workqueue; -static struct virtio_vsock *the_virtio_vsock; -static DEFINE_MUTEX(the_virtio_vsock_mutex); /* protects the_virtio_vsock */ -static void virtio_vsock_rx_fill(struct virtio_vsock *vsock); - -struct virtio_vsock { - /* Virtio device */ - struct virtio_device *vdev; - /* Virtio virtqueue */ - struct virtqueue *vqs[VSOCK_VQ_MAX]; - /* Wait queue for send pkt */ - wait_queue_head_t queue_wait; - /* Work item to send pkt */ - struct work_struct tx_work; - /* Work item to recv pkt */ - struct work_struct rx_work; - /* Mutex to protect send pkt*/ - struct mutex tx_lock; - /* Mutex to protect recv pkt*/ - struct mutex rx_lock; - /* Number of recv buffers */ - int rx_buf_nr; - /* Number of max recv buffers */ - int rx_buf_max_nr; - /* Used for global tx buf limitation */ - u32 total_tx_buf; - /* Guest context id, just like guest ip address */ - u32 guest_cid; -}; - -static struct virtio_vsock *virtio_vsock_get(void) -{ - return the_virtio_vsock; -} - -static u32 virtio_transport_get_local_cid(void) -{ - struct virtio_vsock *vsock = virtio_vsock_get(); - - return vsock->guest_cid; -} - -static int -virtio_transport_send_pkt(struct vsock_sock *vsk, - struct virtio_vsock_pkt_info *info) -{ - u32 src_cid, src_port, dst_cid, dst_port; - int ret, in_sg = 0, out_sg = 0; - struct virtio_transport *trans; - struct virtio_vsock_pkt *pkt; - struct virtio_vsock *vsock; - struct scatterlist hdr, buf, *sgs[2]; - struct virtqueue *vq; - u32 pkt_len = info->pkt_len; - DEFINE_WAIT(wait); - - vsock = virtio_vsock_get(); - if (!vsock) - return -ENODEV; - - src_cid = virtio_transport_get_local_cid(); - src_port = vsk->local_addr.svm_port; - if (!info->remote_cid) { - dst_cid = vsk->remote_addr.svm_cid; - dst_port = vsk->remote_addr.svm_port; - } else { - dst_cid = info->remote_cid; - dst_port = info->remote_port; - } - - trans = vsk->trans; - vq = vsock->vqs[VSOCK_VQ_TX]; - - if (pkt_len > VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE) - pkt_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE; - pkt_len = virtio_transport_get_credit(trans, pkt_len); - /* Do not send zero length OP_RW pkt*/ - if (pkt_len == 0 && info->op == VIRTIO_VSOCK_OP_RW) - return pkt_len; - - /* Respect global tx buf limitation */ - mutex_lock(&vsock->tx_lock); - while (pkt_len + vsock->total_tx_buf > VIRTIO_VSOCK_MAX_TX_BUF_SIZE) { - prepare_to_wait_exclusive(&vsock->queue_wait, &wait, - TASK_UNINTERRUPTIBLE); - mutex_unlock(&vsock->tx_lock); - schedule(); - mutex_lock(&vsock->tx_lock); - finish_wait(&vsock->queue_wait, &wait); - } - vsock->total_tx_buf += pkt_len; - mutex_unlock(&vsock->tx_lock); - - pkt = virtio_transport_alloc_pkt(vsk, info, pkt_len, - src_cid, src_port, - dst_cid, dst_port); - if (!pkt) { - mutex_lock(&vsock->tx_lock); - vsock->total_tx_buf -= pkt_len; - mutex_unlock(&vsock->tx_lock); - virtio_transport_put_credit(trans, pkt_len); - return -ENOMEM; - } - - pr_debug("%s:info->pkt_len= %d\n", __func__, info->pkt_len); - - /* Will be released in virtio_transport_send_pkt_work */ - sock_hold(&trans->vsk->sk); - virtio_transport_inc_tx_pkt(pkt); - - /* Put pkt in the virtqueue */ - sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr)); - sgs[out_sg++] = &hdr; - if (info->msg && info->pkt_len > 0) { - sg_init_one(&buf, pkt->buf, pkt->len); - sgs[out_sg++] = &buf; - } - - mutex_lock(&vsock->tx_lock); - while ((ret = virtqueue_add_sgs(vq, sgs, out_sg, in_sg, pkt, - GFP_KERNEL)) < 0) { - 
prepare_to_wait_exclusive(&vsock->queue_wait, &wait, - TASK_UNINTERRUPTIBLE); - mutex_unlock(&vsock->tx_lock); - schedule(); - mutex_lock(&vsock->tx_lock); - finish_wait(&vsock->queue_wait, &wait); - } - virtqueue_kick(vq); - mutex_unlock(&vsock->tx_lock); - - return pkt_len; -} - -static struct virtio_transport_pkt_ops virtio_ops = { - .send_pkt = virtio_transport_send_pkt, -}; - -static void virtio_vsock_rx_fill(struct virtio_vsock *vsock) -{ - int buf_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE; - struct virtio_vsock_pkt *pkt; - struct scatterlist hdr, buf, *sgs[2]; - struct virtqueue *vq; - int ret; - - vq = vsock->vqs[VSOCK_VQ_RX]; - - do { - pkt = kzalloc(sizeof(*pkt), GFP_KERNEL); - if (!pkt) { - pr_debug("%s: fail to allocate pkt\n", __func__); - goto out; - } - - /* TODO: use mergeable rx buffer */ - pkt->buf = kmalloc(buf_len, GFP_KERNEL); - if (!pkt->buf) { - pr_debug("%s: fail to allocate pkt->buf\n", __func__); - goto err; - } - - sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr)); - sgs[0] = &hdr; - - sg_init_one(&buf, pkt->buf, buf_len); - sgs[1] = &buf; - ret = virtqueue_add_sgs(vq, sgs, 0, 2, pkt, GFP_KERNEL); - if (ret) - goto err; - vsock->rx_buf_nr++; - } while (vq->num_free); - if (vsock->rx_buf_nr > vsock->rx_buf_max_nr) - vsock->rx_buf_max_nr = vsock->rx_buf_nr; -out: - virtqueue_kick(vq); - return; -err: - virtqueue_kick(vq); - virtio_transport_free_pkt(pkt); - return; -} - -static void virtio_transport_send_pkt_work(struct work_struct *work) -{ - struct virtio_vsock *vsock = - container_of(work, struct virtio_vsock, tx_work); - struct virtio_vsock_pkt *pkt; - bool added = false; - struct virtqueue *vq; - unsigned int len; - struct sock *sk; - - vq = vsock->vqs[VSOCK_VQ_TX]; - mutex_lock(&vsock->tx_lock); - do { - virtqueue_disable_cb(vq); - while ((pkt = virtqueue_get_buf(vq, &len)) != NULL) { - sk = &pkt->trans->vsk->sk; - virtio_transport_dec_tx_pkt(pkt); - /* Release refcnt taken in virtio_transport_send_pkt */ - sock_put(sk); - vsock->total_tx_buf -= pkt->len; - virtio_transport_free_pkt(pkt); - added = true; - } - } while (!virtqueue_enable_cb(vq)); - mutex_unlock(&vsock->tx_lock); - - if (added) - wake_up(&vsock->queue_wait); -} - -static void virtio_transport_recv_pkt_work(struct work_struct *work) -{ - struct virtio_vsock *vsock = - container_of(work, struct virtio_vsock, rx_work); - struct virtio_vsock_pkt *pkt; - struct virtqueue *vq; - unsigned int len; - - vq = vsock->vqs[VSOCK_VQ_RX]; - mutex_lock(&vsock->rx_lock); - do { - virtqueue_disable_cb(vq); - while ((pkt = virtqueue_get_buf(vq, &len)) != NULL) { - pkt->len = len; - virtio_transport_recv_pkt(pkt); - vsock->rx_buf_nr--; - } - } while (!virtqueue_enable_cb(vq)); - - if (vsock->rx_buf_nr < vsock->rx_buf_max_nr / 2) - virtio_vsock_rx_fill(vsock); - mutex_unlock(&vsock->rx_lock); -} - -static void virtio_vsock_ctrl_done(struct virtqueue *vq) -{ -} - -static void virtio_vsock_tx_done(struct virtqueue *vq) -{ - struct virtio_vsock *vsock = vq->vdev->priv; - - if (!vsock) - return; - queue_work(virtio_vsock_workqueue, &vsock->tx_work); -} - -static void virtio_vsock_rx_done(struct virtqueue *vq) -{ - struct virtio_vsock *vsock = vq->vdev->priv; - - if (!vsock) - return; - queue_work(virtio_vsock_workqueue, &vsock->rx_work); -} - -static int -virtio_transport_socket_init(struct vsock_sock *vsk, struct vsock_sock *psk) -{ - struct virtio_transport *trans; - int ret; - - ret = virtio_transport_do_socket_init(vsk, psk); - if (ret) - return ret; - - trans = vsk->trans; - trans->ops = &virtio_ops; - return ret; -} - 
-static struct vsock_transport virtio_transport = { - .get_local_cid = virtio_transport_get_local_cid, - - .init = virtio_transport_socket_init, - .destruct = virtio_transport_destruct, - .release = virtio_transport_release, - .connect = virtio_transport_connect, - .shutdown = virtio_transport_shutdown, - - .dgram_bind = virtio_transport_dgram_bind, - .dgram_dequeue = virtio_transport_dgram_dequeue, - .dgram_enqueue = virtio_transport_dgram_enqueue, - .dgram_allow = virtio_transport_dgram_allow, - - .stream_dequeue = virtio_transport_stream_dequeue, - .stream_enqueue = virtio_transport_stream_enqueue, - .stream_has_data = virtio_transport_stream_has_data, - .stream_has_space = virtio_transport_stream_has_space, - .stream_rcvhiwat = virtio_transport_stream_rcvhiwat, - .stream_is_active = virtio_transport_stream_is_active, - .stream_allow = virtio_transport_stream_allow, - - .notify_poll_in = virtio_transport_notify_poll_in, - .notify_poll_out = virtio_transport_notify_poll_out, - .notify_recv_init = virtio_transport_notify_recv_init, - .notify_recv_pre_block = virtio_transport_notify_recv_pre_block, - .notify_recv_pre_dequeue = virtio_transport_notify_recv_pre_dequeue, - .notify_recv_post_dequeue = virtio_transport_notify_recv_post_dequeue, - .notify_send_init = virtio_transport_notify_send_init, - .notify_send_pre_block = virtio_transport_notify_send_pre_block, - .notify_send_pre_enqueue = virtio_transport_notify_send_pre_enqueue, - .notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue, - - .set_buffer_size = virtio_transport_set_buffer_size, - .set_min_buffer_size = virtio_transport_set_min_buffer_size, - .set_max_buffer_size = virtio_transport_set_max_buffer_size, - .get_buffer_size = virtio_transport_get_buffer_size, - .get_min_buffer_size = virtio_transport_get_min_buffer_size, - .get_max_buffer_size = virtio_transport_get_max_buffer_size, -}; - -static int virtio_vsock_probe(struct virtio_device *vdev) -{ - vq_callback_t *callbacks[] = { - virtio_vsock_ctrl_done, - virtio_vsock_rx_done, - virtio_vsock_tx_done, - }; - const char *names[] = { - "ctrl", - "rx", - "tx", - }; - struct virtio_vsock *vsock = NULL; - u32 guest_cid; - int ret; - - ret = mutex_lock_interruptible(&the_virtio_vsock_mutex); - if (ret) - return ret; - - /* Only one virtio-vsock device per guest is supported */ - if (the_virtio_vsock) { - ret = -EBUSY; - goto out; - } - - vsock = kzalloc(sizeof(*vsock), GFP_KERNEL); - if (!vsock) { - ret = -ENOMEM; - goto out; - } - - vsock->vdev = vdev; - - ret = vsock->vdev->config->find_vqs(vsock->vdev, VSOCK_VQ_MAX, - vsock->vqs, callbacks, names); - if (ret < 0) - goto out; - - vdev->config->get(vdev, offsetof(struct virtio_vsock_config, guest_cid), - &guest_cid, sizeof(guest_cid)); - vsock->guest_cid = le32_to_cpu(guest_cid); - pr_debug("%s:guest_cid=%d\n", __func__, vsock->guest_cid); - - ret = vsock_core_init(&virtio_transport); - if (ret < 0) - goto out_vqs; - - vsock->rx_buf_nr = 0; - vsock->rx_buf_max_nr = 0; - - vdev->priv = the_virtio_vsock = vsock; - init_waitqueue_head(&vsock->queue_wait); - mutex_init(&vsock->tx_lock); - mutex_init(&vsock->rx_lock); - INIT_WORK(&vsock->rx_work, virtio_transport_recv_pkt_work); - INIT_WORK(&vsock->tx_work, virtio_transport_send_pkt_work); - - mutex_lock(&vsock->rx_lock); - virtio_vsock_rx_fill(vsock); - mutex_unlock(&vsock->rx_lock); - - mutex_unlock(&the_virtio_vsock_mutex); - return 0; - -out_vqs: - vsock->vdev->config->del_vqs(vsock->vdev); -out: - kfree(vsock); - mutex_unlock(&the_virtio_vsock_mutex); - return ret; 
-} - -static void virtio_vsock_remove(struct virtio_device *vdev) -{ - struct virtio_vsock *vsock = vdev->priv; - - mutex_lock(&the_virtio_vsock_mutex); - the_virtio_vsock = NULL; - vsock_core_exit(); - mutex_unlock(&the_virtio_vsock_mutex); - - kfree(vsock); -} - -static struct virtio_device_id id_table[] = { - { VIRTIO_ID_VSOCK, VIRTIO_DEV_ANY_ID }, - { 0 }, -}; - -static unsigned int features[] = { -}; - -static struct virtio_driver virtio_vsock_driver = { - .feature_table = features, - .feature_table_size = ARRAY_SIZE(features), - .driver.name = KBUILD_MODNAME, - .driver.owner = THIS_MODULE, - .id_table = id_table, - .probe = virtio_vsock_probe, - .remove = virtio_vsock_remove, -}; - -static int __init virtio_vsock_init(void) -{ - int ret; - - virtio_vsock_workqueue = alloc_workqueue("virtio_vsock", 0, 0); - if (!virtio_vsock_workqueue) - return -ENOMEM; - ret = register_virtio_driver(&virtio_vsock_driver); - if (ret) - destroy_workqueue(virtio_vsock_workqueue); - return ret; -} - -static void __exit virtio_vsock_exit(void) -{ - unregister_virtio_driver(&virtio_vsock_driver); - destroy_workqueue(virtio_vsock_workqueue); -} - -module_init(virtio_vsock_init); -module_exit(virtio_vsock_exit); -MODULE_LICENSE("GPL v2"); -MODULE_AUTHOR("Asias He"); -MODULE_DESCRIPTION("virtio transport for vsock"); -MODULE_DEVICE_TABLE(virtio, id_table); diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c deleted file mode 100644 index 28f790da6f15..000000000000 --- a/net/vmw_vsock/virtio_transport_common.c +++ /dev/null @@ -1,1272 +0,0 @@ -/* - * common code for virtio vsock - * - * Copyright (C) 2013-2015 Red Hat, Inc. - * Author: Asias He - * Stefan Hajnoczi - * - * This work is licensed under the terms of the GNU GPL, version 2. 
- */ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#define COOKIEBITS 24 -#define COOKIEMASK (((u32)1 << COOKIEBITS) - 1) -#define VSOCK_TIMEOUT_INIT 4 - -#define SHA_MESSAGE_WORDS 16 -#define SHA_VSOCK_WORDS 5 - -static u32 vsockcookie_secret[2][SHA_MESSAGE_WORDS - SHA_VSOCK_WORDS + - SHA_DIGEST_WORDS]; - -static DEFINE_PER_CPU(__u32[SHA_MESSAGE_WORDS + SHA_DIGEST_WORDS + - SHA_WORKSPACE_WORDS], vsock_cookie_scratch); - -static u32 cookie_hash(u32 saddr, u32 daddr, u16 sport, u16 dport, - u32 count, int c) -{ - __u32 *tmp = this_cpu_ptr(vsock_cookie_scratch); - - memcpy(tmp + SHA_VSOCK_WORDS, vsockcookie_secret[c], - sizeof(vsockcookie_secret[c])); - tmp[0] = saddr; - tmp[1] = daddr; - tmp[2] = sport; - tmp[3] = dport; - tmp[4] = count; - sha_transform(tmp + SHA_MESSAGE_WORDS, (__u8 *)tmp, - tmp + SHA_MESSAGE_WORDS + SHA_DIGEST_WORDS); - - return tmp[17]; -} - -static u32 -virtio_vsock_secure_cookie(u32 saddr, u32 daddr, u32 sport, u32 dport, - u32 count) -{ - u32 h1, h2; - - h1 = cookie_hash(saddr, daddr, sport, dport, 0, 0); - h2 = cookie_hash(saddr, daddr, sport, dport, count, 1); - - return h1 + (count << COOKIEBITS) + (h2 & COOKIEMASK); -} - -static u32 -virtio_vsock_check_cookie(u32 saddr, u32 daddr, u32 sport, u32 dport, - u32 count, u32 cookie, u32 maxdiff) -{ - u32 diff; - u32 ret; - - cookie -= cookie_hash(saddr, daddr, sport, dport, 0, 0); - - diff = (count - (cookie >> COOKIEBITS)) & ((u32)-1 >> COOKIEBITS); - pr_debug("%s: diff=%x\n", __func__, diff); - if (diff >= maxdiff) - return (u32)-1; - - ret = (cookie - - cookie_hash(saddr, daddr, sport, dport, count - diff, 1)) - & COOKIEMASK; - pr_debug("%s: ret=%x\n", __func__, ret); - - return ret; -} - -void virtio_vsock_dumppkt(const char *func, const struct virtio_vsock_pkt *pkt) -{ - pr_debug("%s: pkt=%p, op=%d, len=%d, %d:%d---%d:%d, len=%d\n", - func, pkt, - le16_to_cpu(pkt->hdr.op), - le32_to_cpu(pkt->hdr.len), - le32_to_cpu(pkt->hdr.src_cid), - le32_to_cpu(pkt->hdr.src_port), - le32_to_cpu(pkt->hdr.dst_cid), - le32_to_cpu(pkt->hdr.dst_port), - pkt->len); -} -EXPORT_SYMBOL_GPL(virtio_vsock_dumppkt); - -struct virtio_vsock_pkt * -virtio_transport_alloc_pkt(struct vsock_sock *vsk, - struct virtio_vsock_pkt_info *info, - size_t len, - u32 src_cid, - u32 src_port, - u32 dst_cid, - u32 dst_port) -{ - struct virtio_transport *trans = vsk->trans; - struct virtio_vsock_pkt *pkt; - int err; - - BUG_ON(!trans); - - pkt = kzalloc(sizeof(*pkt), GFP_KERNEL); - if (!pkt) - return NULL; - - pkt->hdr.type = cpu_to_le16(info->type); - pkt->hdr.op = cpu_to_le16(info->op); - pkt->hdr.src_cid = cpu_to_le32(src_cid); - pkt->hdr.src_port = cpu_to_le32(src_port); - pkt->hdr.dst_cid = cpu_to_le32(dst_cid); - pkt->hdr.dst_port = cpu_to_le32(dst_port); - pkt->hdr.flags = cpu_to_le32(info->flags); - pkt->len = len; - pkt->trans = trans; - if (info->type == VIRTIO_VSOCK_TYPE_DGRAM) - pkt->hdr.len = cpu_to_le32(len + (info->dgram_len << 16)); - else if (info->type == VIRTIO_VSOCK_TYPE_STREAM) - pkt->hdr.len = cpu_to_le32(len); - - if (info->msg && len > 0) { - pkt->buf = kmalloc(len, GFP_KERNEL); - if (!pkt->buf) - goto out_pkt; - err = memcpy_from_msg(pkt->buf, info->msg, len); - if (err) - goto out; - } - - return pkt; - -out: - kfree(pkt->buf); -out_pkt: - kfree(pkt); - return NULL; -} -EXPORT_SYMBOL_GPL(virtio_transport_alloc_pkt); - -struct sock * -virtio_transport_get_pending(struct sock *listener, - struct virtio_vsock_pkt *pkt) -{ - struct vsock_sock *vlistener; - struct
vsock_sock *vpending; - struct sockaddr_vm src; - struct sockaddr_vm dst; - struct sock *pending; - - vsock_addr_init(&src, le32_to_cpu(pkt->hdr.src_cid), le32_to_cpu(pkt->hdr.src_port)); - vsock_addr_init(&dst, le32_to_cpu(pkt->hdr.dst_cid), le32_to_cpu(pkt->hdr.dst_port)); - - vlistener = vsock_sk(listener); - list_for_each_entry(vpending, &vlistener->pending_links, - pending_links) { - if (vsock_addr_equals_addr(&src, &vpending->remote_addr) && - vsock_addr_equals_addr(&dst, &vpending->local_addr)) { - pending = sk_vsock(vpending); - sock_hold(pending); - return pending; - } - } - - return NULL; -} -EXPORT_SYMBOL_GPL(virtio_transport_get_pending); - -static void virtio_transport_inc_rx_pkt(struct virtio_vsock_pkt *pkt) -{ - pkt->trans->rx_bytes += pkt->len; -} - -static void virtio_transport_dec_rx_pkt(struct virtio_vsock_pkt *pkt) -{ - pkt->trans->rx_bytes -= pkt->len; - pkt->trans->fwd_cnt += pkt->len; -} - -void virtio_transport_inc_tx_pkt(struct virtio_vsock_pkt *pkt) -{ - mutex_lock(&pkt->trans->tx_lock); - pkt->hdr.fwd_cnt = cpu_to_le32(pkt->trans->fwd_cnt); - pkt->hdr.buf_alloc = cpu_to_le32(pkt->trans->buf_alloc); - mutex_unlock(&pkt->trans->tx_lock); -} -EXPORT_SYMBOL_GPL(virtio_transport_inc_tx_pkt); - -void virtio_transport_dec_tx_pkt(struct virtio_vsock_pkt *pkt) -{ -} -EXPORT_SYMBOL_GPL(virtio_transport_dec_tx_pkt); - -u32 virtio_transport_get_credit(struct virtio_transport *trans, u32 credit) -{ - u32 ret; - - mutex_lock(&trans->tx_lock); - ret = trans->peer_buf_alloc - (trans->tx_cnt - trans->peer_fwd_cnt); - if (ret > credit) - ret = credit; - trans->tx_cnt += ret; - mutex_unlock(&trans->tx_lock); - - pr_debug("%s: ret=%d, buf_alloc=%d, peer_buf_alloc=%d," - "tx_cnt=%d, fwd_cnt=%d, peer_fwd_cnt=%d\n", __func__, - ret, trans->buf_alloc, trans->peer_buf_alloc, - trans->tx_cnt, trans->fwd_cnt, trans->peer_fwd_cnt); - - return ret; -} -EXPORT_SYMBOL_GPL(virtio_transport_get_credit); - -void virtio_transport_put_credit(struct virtio_transport *trans, u32 credit) -{ - mutex_lock(&trans->tx_lock); - trans->tx_cnt -= credit; - mutex_unlock(&trans->tx_lock); -} -EXPORT_SYMBOL_GPL(virtio_transport_put_credit); - -static int virtio_transport_send_credit_update(struct vsock_sock *vsk, int type, struct virtio_vsock_hdr *hdr) -{ - struct virtio_transport *trans = vsk->trans; - struct virtio_vsock_pkt_info info = { - .op = VIRTIO_VSOCK_OP_CREDIT_UPDATE, - .type = type, - }; - - if (hdr && type == VIRTIO_VSOCK_TYPE_DGRAM) { - info.remote_cid = le32_to_cpu(hdr->src_cid); - info.remote_port = le32_to_cpu(hdr->src_port); - } - - pr_debug("%s: sk=%p send_credit_update\n", __func__, vsk); - return trans->ops->send_pkt(vsk, &info); -} - -static int virtio_transport_send_credit_request(struct vsock_sock *vsk, int type) -{ - struct virtio_transport *trans = vsk->trans; - struct virtio_vsock_pkt_info info = { - .op = VIRTIO_VSOCK_OP_CREDIT_REQUEST, - .type = type, - }; - - pr_debug("%s: sk=%p send_credit_request\n", __func__, vsk); - return trans->ops->send_pkt(vsk, &info); -} - -static ssize_t -virtio_transport_stream_do_dequeue(struct vsock_sock *vsk, - struct msghdr *msg, - size_t len) -{ - struct virtio_transport *trans = vsk->trans; - struct virtio_vsock_pkt *pkt; - size_t bytes, total = 0; - int err = -EFAULT; - - mutex_lock(&trans->rx_lock); - while (total < len && trans->rx_bytes > 0 && - !list_empty(&trans->rx_queue)) { - pkt = list_first_entry(&trans->rx_queue, - struct virtio_vsock_pkt, list); - - bytes = len - total; - if (bytes > pkt->len - pkt->off) - bytes = pkt->len - pkt->off; - 
- err = memcpy_to_msg(msg, pkt->buf + pkt->off, bytes); - if (err) - goto out; - total += bytes; - pkt->off += bytes; - if (pkt->off == pkt->len) { - virtio_transport_dec_rx_pkt(pkt); - list_del(&pkt->list); - virtio_transport_free_pkt(pkt); - } - } - mutex_unlock(&trans->rx_lock); - - /* Send a credit pkt to peer */ - virtio_transport_send_credit_update(vsk, VIRTIO_VSOCK_TYPE_STREAM, - NULL); - - return total; - -out: - mutex_unlock(&trans->rx_lock); - if (total) - err = total; - return err; -} - -ssize_t -virtio_transport_stream_dequeue(struct vsock_sock *vsk, - struct msghdr *msg, - size_t len, int flags) -{ - if (flags & MSG_PEEK) - return -EOPNOTSUPP; - - return virtio_transport_stream_do_dequeue(vsk, msg, len); -} -EXPORT_SYMBOL_GPL(virtio_transport_stream_dequeue); - -struct dgram_skb { - struct list_head list; - struct sk_buff *skb; - u16 id; -}; - -static struct dgram_skb *dgram_id_to_skb(struct virtio_transport *trans, - u16 id) -{ - struct dgram_skb *dgram_skb; - - list_for_each_entry(dgram_skb, &trans->incomplete_dgrams, list) { - if (dgram_skb->id == id) - return dgram_skb; - } - - return NULL; -} - -static void -virtio_transport_recv_dgram(struct sock *sk, - struct virtio_vsock_pkt *pkt) -{ - struct sk_buff *skb = NULL; - struct vsock_sock *vsk; - struct virtio_transport *trans; - size_t size; - u16 dgram_id, pkt_off, dgram_len, pkt_len; - u32 flags, len; - struct dgram_skb *dgram_skb; - - vsk = vsock_sk(sk); - trans = vsk->trans; - - /* len: dgram_len | pkt_len */ - len = le32_to_cpu(pkt->hdr.len); - dgram_len = len >> 16; - pkt_len = len & 0xFFFF; - - /* flags: dgram_id | pkt_off */ - flags = le32_to_cpu(pkt->hdr.flags); - dgram_id = flags >> 16; - pkt_off = flags & 0xFFFF; - - pr_debug("%s: dgram_len=%d, pkt_len=%d, id=%d, off=%d\n", __func__, - dgram_len, pkt_len, dgram_id, pkt_off); - - dgram_skb = dgram_id_to_skb(trans, dgram_id); - if (dgram_skb) { - /* This pkt is for an existing dgram */ - skb = dgram_skb->skb; - pr_debug("%s:found skb\n", __func__); - } - - /* Packet payload must be within datagram bounds */ - if (pkt_len > VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE) - goto drop; - if (pkt_len > dgram_len) - goto drop; - if (pkt_off > dgram_len) - goto drop; - if (dgram_len - pkt_off < pkt_len) - goto drop; - - if (!skb) { - /* This pkt is for a new dgram */ - pr_debug("%s:create skb\n", __func__); - - size = sizeof(pkt->hdr) + dgram_len; - /* Attach the packet to the socket's receive queue as an sk_buff. */ - dgram_skb = kzalloc(sizeof(struct dgram_skb), GFP_ATOMIC); - if (!dgram_skb) - goto drop; - - skb = alloc_skb(size, GFP_ATOMIC); - if (!skb) { - kfree(dgram_skb); - dgram_skb = NULL; - goto drop; - } - dgram_skb->id = dgram_id; - dgram_skb->skb = skb; - list_add_tail(&dgram_skb->list, &trans->incomplete_dgrams); - - /* sk_receive_skb() will do a sock_put(), so hold here.
*/ - sock_hold(sk); - skb_put(skb, size); - memcpy(skb->data, &pkt->hdr, sizeof(pkt->hdr)); - } - - memcpy(skb->data + sizeof(pkt->hdr) + pkt_off, pkt->buf, pkt_len); - - pr_debug("%s:C, off=%d, pkt_len=%d, dgram_len=%d\n", __func__, - pkt_off, pkt_len, dgram_len); - - /* We are done with this dgram */ - if (pkt_off + pkt_len == dgram_len) { - pr_debug("%s:dgram_id=%d is done\n", __func__, dgram_id); - list_del(&dgram_skb->list); - kfree(dgram_skb); - sk_receive_skb(sk, skb, 0); - } - virtio_transport_free_pkt(pkt); - return; - -drop: - if (dgram_skb) { - list_del(&dgram_skb->list); - kfree(dgram_skb); - kfree_skb(skb); - sock_put(sk); - } - virtio_transport_free_pkt(pkt); -} - -int -virtio_transport_dgram_dequeue(struct vsock_sock *vsk, - struct msghdr *msg, - size_t len, int flags) -{ - struct virtio_vsock_hdr *hdr; - struct sk_buff *skb; - int noblock; - int err; - int dgram_len; - - noblock = flags & MSG_DONTWAIT; - - if (flags & MSG_OOB || flags & MSG_ERRQUEUE) - return -EOPNOTSUPP; - - /* Retrieve the head sk_buff from the socket's receive queue. */ - err = 0; - skb = skb_recv_datagram(&vsk->sk, flags, noblock, &err); - if (err) - return err; - if (!skb) - return -EAGAIN; - - hdr = (struct virtio_vsock_hdr *)skb->data; - if (!hdr) - goto out; - - dgram_len = le32_to_cpu(hdr->len) >> 16; - /* Place the datagram payload in the user's iovec. */ - err = skb_copy_datagram_msg(skb, sizeof(*hdr), msg, dgram_len); - if (err) - goto out; - - if (msg->msg_name) { - /* Provide the address of the sender. */ - DECLARE_SOCKADDR(struct sockaddr_vm *, vm_addr, msg->msg_name); - vsock_addr_init(vm_addr, le32_to_cpu(hdr->src_cid), le32_to_cpu(hdr->src_port)); - msg->msg_namelen = sizeof(*vm_addr); - } - err = dgram_len; - - /* Send a credit pkt to peer */ - virtio_transport_send_credit_update(vsk, VIRTIO_VSOCK_TYPE_DGRAM, hdr); - - pr_debug("%s:done, recved =%d\n", __func__, dgram_len); -out: - skb_free_datagram(&vsk->sk, skb); - return err; -} -EXPORT_SYMBOL_GPL(virtio_transport_dgram_dequeue); - -s64 virtio_transport_stream_has_data(struct vsock_sock *vsk) -{ - struct virtio_transport *trans = vsk->trans; - s64 bytes; - - mutex_lock(&trans->rx_lock); - bytes = trans->rx_bytes; - mutex_unlock(&trans->rx_lock); - - return bytes; -} -EXPORT_SYMBOL_GPL(virtio_transport_stream_has_data); - -static s64 virtio_transport_has_space(struct vsock_sock *vsk) -{ - struct virtio_transport *trans = vsk->trans; - s64 bytes; - - bytes = trans->peer_buf_alloc - (trans->tx_cnt - trans->peer_fwd_cnt); - if (bytes < 0) - bytes = 0; - - return bytes; -} - -s64 virtio_transport_stream_has_space(struct vsock_sock *vsk) -{ - struct virtio_transport *trans = vsk->trans; - s64 bytes; - - mutex_lock(&trans->tx_lock); - bytes = virtio_transport_has_space(vsk); - mutex_unlock(&trans->tx_lock); - - pr_debug("%s: bytes=%lld\n", __func__, bytes); - - return bytes; -} -EXPORT_SYMBOL_GPL(virtio_transport_stream_has_space); - -int virtio_transport_do_socket_init(struct vsock_sock *vsk, - struct vsock_sock *psk) -{ - struct virtio_transport *trans; - - trans = kzalloc(sizeof(*trans), GFP_KERNEL); - if (!trans) - return -ENOMEM; - - vsk->trans = trans; - trans->vsk = vsk; - if (psk) { - struct virtio_transport *ptrans = psk->trans; - trans->buf_size = ptrans->buf_size; - trans->buf_size_min = ptrans->buf_size_min; - trans->buf_size_max = ptrans->buf_size_max; - trans->peer_buf_alloc = ptrans->peer_buf_alloc; - } else { - trans->buf_size = VIRTIO_VSOCK_DEFAULT_BUF_SIZE; - trans->buf_size_min = VIRTIO_VSOCK_DEFAULT_MIN_BUF_SIZE; - 
trans->buf_size_max = VIRTIO_VSOCK_DEFAULT_MAX_BUF_SIZE; - } - - trans->buf_alloc = trans->buf_size; - - pr_debug("%s: trans->buf_alloc=%d\n", __func__, trans->buf_alloc); - - mutex_init(&trans->rx_lock); - mutex_init(&trans->tx_lock); - INIT_LIST_HEAD(&trans->rx_queue); - INIT_LIST_HEAD(&trans->incomplete_dgrams); - - return 0; -} -EXPORT_SYMBOL_GPL(virtio_transport_do_socket_init); - -u64 virtio_transport_get_buffer_size(struct vsock_sock *vsk) -{ - struct virtio_transport *trans = vsk->trans; - - return trans->buf_size; -} -EXPORT_SYMBOL_GPL(virtio_transport_get_buffer_size); - -u64 virtio_transport_get_min_buffer_size(struct vsock_sock *vsk) -{ - struct virtio_transport *trans = vsk->trans; - - return trans->buf_size_min; -} -EXPORT_SYMBOL_GPL(virtio_transport_get_min_buffer_size); - -u64 virtio_transport_get_max_buffer_size(struct vsock_sock *vsk) -{ - struct virtio_transport *trans = vsk->trans; - - return trans->buf_size_max; -} -EXPORT_SYMBOL_GPL(virtio_transport_get_max_buffer_size); - -void virtio_transport_set_buffer_size(struct vsock_sock *vsk, u64 val) -{ - struct virtio_transport *trans = vsk->trans; - - if (val > VIRTIO_VSOCK_MAX_BUF_SIZE) - val = VIRTIO_VSOCK_MAX_BUF_SIZE; - if (val < trans->buf_size_min) - trans->buf_size_min = val; - if (val > trans->buf_size_max) - trans->buf_size_max = val; - trans->buf_size = val; - trans->buf_alloc = val; -} -EXPORT_SYMBOL_GPL(virtio_transport_set_buffer_size); - -void virtio_transport_set_min_buffer_size(struct vsock_sock *vsk, u64 val) -{ - struct virtio_transport *trans = vsk->trans; - - if (val > VIRTIO_VSOCK_MAX_BUF_SIZE) - val = VIRTIO_VSOCK_MAX_BUF_SIZE; - if (val > trans->buf_size) - trans->buf_size = val; - trans->buf_size_min = val; -} -EXPORT_SYMBOL_GPL(virtio_transport_set_min_buffer_size); - -void virtio_transport_set_max_buffer_size(struct vsock_sock *vsk, u64 val) -{ - struct virtio_transport *trans = vsk->trans; - - if (val > VIRTIO_VSOCK_MAX_BUF_SIZE) - val = VIRTIO_VSOCK_MAX_BUF_SIZE; - if (val < trans->buf_size) - trans->buf_size = val; - trans->buf_size_max = val; -} -EXPORT_SYMBOL_GPL(virtio_transport_set_max_buffer_size); - -int -virtio_transport_notify_poll_in(struct vsock_sock *vsk, - size_t target, - bool *data_ready_now) -{ - if (vsock_stream_has_data(vsk)) - *data_ready_now = true; - else - *data_ready_now = false; - - return 0; -} -EXPORT_SYMBOL_GPL(virtio_transport_notify_poll_in); - -int -virtio_transport_notify_poll_out(struct vsock_sock *vsk, - size_t target, - bool *space_avail_now) -{ - s64 free_space; - - free_space = vsock_stream_has_space(vsk); - if (free_space > 0) - *space_avail_now = true; - else if (free_space == 0) - *space_avail_now = false; - - return 0; -} -EXPORT_SYMBOL_GPL(virtio_transport_notify_poll_out); - -int virtio_transport_notify_recv_init(struct vsock_sock *vsk, - size_t target, struct vsock_transport_recv_notify_data *data) -{ - return 0; -} -EXPORT_SYMBOL_GPL(virtio_transport_notify_recv_init); - -int virtio_transport_notify_recv_pre_block(struct vsock_sock *vsk, - size_t target, struct vsock_transport_recv_notify_data *data) -{ - return 0; -} -EXPORT_SYMBOL_GPL(virtio_transport_notify_recv_pre_block); - -int virtio_transport_notify_recv_pre_dequeue(struct vsock_sock *vsk, - size_t target, struct vsock_transport_recv_notify_data *data) -{ - return 0; -} -EXPORT_SYMBOL_GPL(virtio_transport_notify_recv_pre_dequeue); - -int virtio_transport_notify_recv_post_dequeue(struct vsock_sock *vsk, - size_t target, ssize_t copied, bool data_read, - struct vsock_transport_recv_notify_data 
*data) -{ - return 0; -} -EXPORT_SYMBOL_GPL(virtio_transport_notify_recv_post_dequeue); - -int virtio_transport_notify_send_init(struct vsock_sock *vsk, - struct vsock_transport_send_notify_data *data) -{ - return 0; -} -EXPORT_SYMBOL_GPL(virtio_transport_notify_send_init); - -int virtio_transport_notify_send_pre_block(struct vsock_sock *vsk, - struct vsock_transport_send_notify_data *data) -{ - return 0; -} -EXPORT_SYMBOL_GPL(virtio_transport_notify_send_pre_block); - -int virtio_transport_notify_send_pre_enqueue(struct vsock_sock *vsk, - struct vsock_transport_send_notify_data *data) -{ - return 0; -} -EXPORT_SYMBOL_GPL(virtio_transport_notify_send_pre_enqueue); - -int virtio_transport_notify_send_post_enqueue(struct vsock_sock *vsk, - ssize_t written, struct vsock_transport_send_notify_data *data) -{ - return 0; -} -EXPORT_SYMBOL_GPL(virtio_transport_notify_send_post_enqueue); - -u64 virtio_transport_stream_rcvhiwat(struct vsock_sock *vsk) -{ - struct virtio_transport *trans = vsk->trans; - - return trans->buf_size; -} -EXPORT_SYMBOL_GPL(virtio_transport_stream_rcvhiwat); - -bool virtio_transport_stream_is_active(struct vsock_sock *vsk) -{ - return true; -} -EXPORT_SYMBOL_GPL(virtio_transport_stream_is_active); - -bool virtio_transport_stream_allow(u32 cid, u32 port) -{ - return true; -} -EXPORT_SYMBOL_GPL(virtio_transport_stream_allow); - -int virtio_transport_dgram_bind(struct vsock_sock *vsk, - struct sockaddr_vm *addr) -{ - return vsock_bind_dgram_generic(vsk, addr); -} -EXPORT_SYMBOL_GPL(virtio_transport_dgram_bind); - -bool virtio_transport_dgram_allow(u32 cid, u32 port) -{ - return true; -} -EXPORT_SYMBOL_GPL(virtio_transport_dgram_allow); - -int virtio_transport_connect(struct vsock_sock *vsk) -{ - struct virtio_transport *trans = vsk->trans; - struct virtio_vsock_pkt_info info = { - .op = VIRTIO_VSOCK_OP_REQUEST, - .type = VIRTIO_VSOCK_TYPE_STREAM, - }; - - pr_debug("%s: vsk=%p send_request\n", __func__, vsk); - return trans->ops->send_pkt(vsk, &info); -} -EXPORT_SYMBOL_GPL(virtio_transport_connect); - -int virtio_transport_shutdown(struct vsock_sock *vsk, int mode) -{ - struct virtio_transport *trans = vsk->trans; - struct virtio_vsock_pkt_info info = { - .op = VIRTIO_VSOCK_OP_SHUTDOWN, - .type = VIRTIO_VSOCK_TYPE_STREAM, - .flags = (mode & RCV_SHUTDOWN ? - VIRTIO_VSOCK_SHUTDOWN_RCV : 0) | - (mode & SEND_SHUTDOWN ? 
- VIRTIO_VSOCK_SHUTDOWN_SEND : 0), - }; - - pr_debug("%s: vsk=%p: send_shutdown\n", __func__, vsk); - return trans->ops->send_pkt(vsk, &info); -} -EXPORT_SYMBOL_GPL(virtio_transport_shutdown); - -void virtio_transport_release(struct vsock_sock *vsk) -{ - struct virtio_transport *trans = vsk->trans; - struct sock *sk = &vsk->sk; - struct dgram_skb *dgram_skb; - struct dgram_skb *dgram_skb_tmp; - - pr_debug("%s: vsk=%p\n", __func__, vsk); - - /* Tell other side to terminate connection */ - if (sk->sk_type == SOCK_STREAM && sk->sk_state == SS_CONNECTED) { - virtio_transport_shutdown(vsk, SHUTDOWN_MASK); - } - - /* Free incomplete dgrams */ - lock_sock(sk); - list_for_each_entry_safe(dgram_skb, dgram_skb_tmp, - &trans->incomplete_dgrams, list) { - list_del(&dgram_skb->list); - kfree_skb(dgram_skb->skb); - kfree(dgram_skb); - sock_put(sk); /* held in virtio_transport_recv_dgram() */ - } - release_sock(sk); -} -EXPORT_SYMBOL_GPL(virtio_transport_release); - -int -virtio_transport_dgram_enqueue(struct vsock_sock *vsk, - struct sockaddr_vm *remote_addr, - struct msghdr *msg, - size_t dgram_len) -{ - struct virtio_transport *trans = vsk->trans; - struct virtio_vsock_pkt_info info = { - .op = VIRTIO_VSOCK_OP_RW, - .type = VIRTIO_VSOCK_TYPE_DGRAM, - .msg = msg, - }; - size_t total_written = 0, pkt_off = 0; - ssize_t written; - u16 dgram_id; - - /* The max size of a single dgram we support is 64KB */ - if (dgram_len > VIRTIO_VSOCK_MAX_DGRAM_SIZE) - return -EMSGSIZE; - - info.dgram_len = dgram_len; - vsk->remote_addr = *remote_addr; - - dgram_id = trans->dgram_id++; - - /* TODO: To optimize, if we have enough credit to send the pkt already, - * do not ask the peer to send credit to us */ - virtio_transport_send_credit_request(vsk, VIRTIO_VSOCK_TYPE_DGRAM); - - while (total_written < dgram_len) { - info.pkt_len = dgram_len - total_written; - info.flags = dgram_id << 16 | pkt_off; - written = trans->ops->send_pkt(vsk, &info); - if (written < 0) - return -ENOMEM; - if (written == 0) { - /* TODO: if written = 0, we need a sleep & wakeup - * instead of sleep */ - pr_debug("%s: SHOULD WAIT written==0", __func__); - msleep(10); - } - total_written += written; - pkt_off += written; - pr_debug("%s:id=%d, dgram_len=%zu, off=%zu, total_written=%zu, written=%zd\n", - __func__, dgram_id, dgram_len, pkt_off, total_written, written); - } - - return dgram_len; -} -EXPORT_SYMBOL_GPL(virtio_transport_dgram_enqueue); - -ssize_t -virtio_transport_stream_enqueue(struct vsock_sock *vsk, - struct msghdr *msg, - size_t len) -{ - struct virtio_transport *trans = vsk->trans; - struct virtio_vsock_pkt_info info = { - .op = VIRTIO_VSOCK_OP_RW, - .type = VIRTIO_VSOCK_TYPE_STREAM, - .msg = msg, - .pkt_len = len, - }; - - return trans->ops->send_pkt(vsk, &info); -} -EXPORT_SYMBOL_GPL(virtio_transport_stream_enqueue); - -void virtio_transport_destruct(struct vsock_sock *vsk) -{ - struct virtio_transport *trans = vsk->trans; - - pr_debug("%s: vsk=%p\n", __func__, vsk); - kfree(trans); -} -EXPORT_SYMBOL_GPL(virtio_transport_destruct); - -static int virtio_transport_send_ack(struct vsock_sock *vsk, u32 cookie) -{ - struct virtio_transport *trans = vsk->trans; - struct virtio_vsock_pkt_info info = { - .op = VIRTIO_VSOCK_OP_ACK, - .type = VIRTIO_VSOCK_TYPE_STREAM, - .flags = cpu_to_le32(cookie), - }; - - pr_debug("%s: sk=%p send_ack\n", __func__, vsk); - return trans->ops->send_pkt(vsk, &info); -} - -static int virtio_transport_send_reset(struct vsock_sock *vsk, - struct virtio_vsock_pkt *pkt) -{ - struct virtio_transport *trans = vsk->trans; -
struct virtio_vsock_pkt_info info = { - .op = VIRTIO_VSOCK_OP_RST, - .type = VIRTIO_VSOCK_TYPE_STREAM, - }; - - pr_debug("%s\n", __func__); - - /* Send RST only if the original pkt is not a RST pkt */ - if (le16_to_cpu(pkt->hdr.op) == VIRTIO_VSOCK_OP_RST) - return 0; - - return trans->ops->send_pkt(vsk, &info); -} - -static int -virtio_transport_recv_connecting(struct sock *sk, - struct virtio_vsock_pkt *pkt) -{ - struct vsock_sock *vsk = vsock_sk(sk); - int err; - int skerr; - u32 cookie; - - pr_debug("%s: vsk=%p\n", __func__, vsk); - switch (le16_to_cpu(pkt->hdr.op)) { - case VIRTIO_VSOCK_OP_RESPONSE: - cookie = le32_to_cpu(pkt->hdr.flags); - pr_debug("%s: got RESPONSE and send ACK, cookie=%x\n", __func__, cookie); - err = virtio_transport_send_ack(vsk, cookie); - if (err < 0) { - skerr = -err; - goto destroy; - } - sk->sk_state = SS_CONNECTED; - sk->sk_socket->state = SS_CONNECTED; - vsock_insert_connected(vsk); - sk->sk_state_change(sk); - break; - case VIRTIO_VSOCK_OP_INVALID: - pr_debug("%s: got invalid\n", __func__); - break; - case VIRTIO_VSOCK_OP_RST: - pr_debug("%s: got rst\n", __func__); - skerr = ECONNRESET; - err = 0; - goto destroy; - default: - pr_debug("%s: got def\n", __func__); - skerr = EPROTO; - err = -EINVAL; - goto destroy; - } - return 0; - -destroy: - virtio_transport_send_reset(vsk, pkt); - sk->sk_state = SS_UNCONNECTED; - sk->sk_err = skerr; - sk->sk_error_report(sk); - return err; -} - -static int -virtio_transport_recv_connected(struct sock *sk, - struct virtio_vsock_pkt *pkt) -{ - struct vsock_sock *vsk = vsock_sk(sk); - struct virtio_transport *trans = vsk->trans; - int err = 0; - - switch (le16_to_cpu(pkt->hdr.op)) { - case VIRTIO_VSOCK_OP_RW: - pkt->len = le32_to_cpu(pkt->hdr.len); - pkt->off = 0; - pkt->trans = trans; - - mutex_lock(&trans->rx_lock); - virtio_transport_inc_rx_pkt(pkt); - list_add_tail(&pkt->list, &trans->rx_queue); - mutex_unlock(&trans->rx_lock); - - sk->sk_data_ready(sk); - return err; - case VIRTIO_VSOCK_OP_CREDIT_UPDATE: - sk->sk_write_space(sk); - break; - case VIRTIO_VSOCK_OP_SHUTDOWN: - pr_debug("%s: got shutdown\n", __func__); - if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SHUTDOWN_RCV) - vsk->peer_shutdown |= RCV_SHUTDOWN; - if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SHUTDOWN_SEND) - vsk->peer_shutdown |= SEND_SHUTDOWN; - if (le32_to_cpu(pkt->hdr.flags)) - sk->sk_state_change(sk); - break; - case VIRTIO_VSOCK_OP_RST: - pr_debug("%s: got rst\n", __func__); - sock_set_flag(sk, SOCK_DONE); - vsk->peer_shutdown = SHUTDOWN_MASK; - if (vsock_stream_has_data(vsk) <= 0) - sk->sk_state = SS_DISCONNECTING; - sk->sk_state_change(sk); - break; - default: - err = -EINVAL; - break; - } - - virtio_transport_free_pkt(pkt); - return err; -} - -static int -virtio_transport_send_response(struct vsock_sock *vsk, - struct virtio_vsock_pkt *pkt) -{ - struct virtio_transport *trans = vsk->trans; - struct virtio_vsock_pkt_info info = { - .op = VIRTIO_VSOCK_OP_RESPONSE, - .type = VIRTIO_VSOCK_TYPE_STREAM, - .remote_cid = le32_to_cpu(pkt->hdr.src_cid), - .remote_port = le32_to_cpu(pkt->hdr.src_port), - }; - u32 cookie; - - cookie = virtio_vsock_secure_cookie(le32_to_cpu(pkt->hdr.src_cid), - le32_to_cpu(pkt->hdr.dst_cid), - le32_to_cpu(pkt->hdr.src_port), - le32_to_cpu(pkt->hdr.dst_port), - jiffies / (HZ * 60)); - info.flags = cpu_to_le32(cookie); - - pr_debug("%s: send_response, cookie=%x\n", __func__, le32_to_cpu(cookie)); - - return trans->ops->send_pkt(vsk, &info); -} - -/* Handle server socket */ -static int -virtio_transport_recv_listen(struct 
sock *sk, struct virtio_vsock_pkt *pkt) -{ - struct vsock_sock *vsk = vsock_sk(sk); - struct vsock_sock *vpending; - struct sock *pending; - int err; - u32 cookie; - - switch (le16_to_cpu(pkt->hdr.op)) { - case VIRTIO_VSOCK_OP_REQUEST: - err = virtio_transport_send_response(vsk, pkt); - if (err < 0) { - // FIXME vsk should be vpending - virtio_transport_send_reset(vsk, pkt); - return err; - } - break; - case VIRTIO_VSOCK_OP_ACK: - cookie = le32_to_cpu(pkt->hdr.flags); - err = virtio_vsock_check_cookie(le32_to_cpu(pkt->hdr.src_cid), - le32_to_cpu(pkt->hdr.dst_cid), - le32_to_cpu(pkt->hdr.src_port), - le32_to_cpu(pkt->hdr.dst_port), - jiffies / (HZ * 60), - le32_to_cpu(pkt->hdr.flags), - VSOCK_TIMEOUT_INIT); - pr_debug("%s: cookie=%x, err=%d\n", __func__, cookie, err); - if (err) - return err; - - /* No pending socket is responsible for this pkt yet, so create one */ - pr_debug("%s: create pending\n", __func__); - pending = __vsock_create(sock_net(sk), NULL, sk, GFP_KERNEL, - sk->sk_type, 0); - if (!pending) { - virtio_transport_send_reset(vsk, pkt); - return -ENOMEM; - } - sk->sk_ack_backlog++; - pending->sk_state = SS_CONNECTING; - - vpending = vsock_sk(pending); - vsock_addr_init(&vpending->local_addr, le32_to_cpu(pkt->hdr.dst_cid), - le32_to_cpu(pkt->hdr.dst_port)); - vsock_addr_init(&vpending->remote_addr, le32_to_cpu(pkt->hdr.src_cid), - le32_to_cpu(pkt->hdr.src_port)); - vsock_add_pending(sk, pending); - - pr_debug("%s: get pending\n", __func__); - pending = virtio_transport_get_pending(sk, pkt); - vpending = vsock_sk(pending); - lock_sock(pending); - switch (pending->sk_state) { - case SS_CONNECTING: - if (le16_to_cpu(pkt->hdr.op) != VIRTIO_VSOCK_OP_ACK) { - pr_debug("%s: op=%d != OP_ACK\n", __func__, - le16_to_cpu(pkt->hdr.op)); - virtio_transport_send_reset(vpending, pkt); - pending->sk_err = EPROTO; - pending->sk_state = SS_UNCONNECTED; - sock_put(pending); - } else { - pending->sk_state = SS_CONNECTED; - vsock_insert_connected(vpending); - - vsock_remove_pending(sk, pending); - vsock_enqueue_accept(sk, pending); - - sk->sk_data_ready(sk); - } - err = 0; - break; - default: - pr_debug("%s: sk->sk_ack_backlog=%d\n", __func__, - sk->sk_ack_backlog); - virtio_transport_send_reset(vpending, pkt); - err = -EINVAL; - break; - } - if (err < 0) - vsock_remove_pending(sk, pending); - release_sock(pending); - - /* Release refcnt obtained in virtio_transport_get_pending */ - sock_put(pending); - break; - default: - break; - } - - return 0; -} - -static void virtio_transport_space_update(struct sock *sk, - struct virtio_vsock_pkt *pkt) -{ - struct vsock_sock *vsk = vsock_sk(sk); - struct virtio_transport *trans = vsk->trans; - bool space_available; - - /* buf_alloc and fwd_cnt are always included in the hdr */ - mutex_lock(&trans->tx_lock); - trans->peer_buf_alloc = le32_to_cpu(pkt->hdr.buf_alloc); - trans->peer_fwd_cnt = le32_to_cpu(pkt->hdr.fwd_cnt); - space_available = virtio_transport_has_space(vsk); - mutex_unlock(&trans->tx_lock); - - if (space_available) - sk->sk_write_space(sk); -} - -/* We are under the virtio-vsock's vsock->rx_lock or - * vhost-vsock's vq->mutex lock */ -void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt) -{ - struct virtio_transport *trans; - struct sockaddr_vm src, dst; - struct vsock_sock *vsk; - struct sock *sk; - - vsock_addr_init(&src, le32_to_cpu(pkt->hdr.src_cid), le32_to_cpu(pkt->hdr.src_port)); - vsock_addr_init(&dst, le32_to_cpu(pkt->hdr.dst_cid), le32_to_cpu(pkt->hdr.dst_port)); - - virtio_vsock_dumppkt(__func__, pkt); - - if
(le16_to_cpu(pkt->hdr.type) == VIRTIO_VSOCK_TYPE_DGRAM) { - sk = vsock_find_unbound_socket(&dst); - if (!sk) - goto free_pkt; - - vsk = vsock_sk(sk); - trans = vsk->trans; - BUG_ON(!trans); - - virtio_transport_space_update(sk, pkt); - - lock_sock(sk); - switch (le16_to_cpu(pkt->hdr.op)) { - case VIRTIO_VSOCK_OP_CREDIT_UPDATE: - virtio_transport_free_pkt(pkt); - break; - case VIRTIO_VSOCK_OP_CREDIT_REQUEST: - virtio_transport_send_credit_update(vsk, VIRTIO_VSOCK_TYPE_DGRAM, - &pkt->hdr); - virtio_transport_free_pkt(pkt); - break; - case VIRTIO_VSOCK_OP_RW: - virtio_transport_recv_dgram(sk, pkt); - break; - default: - virtio_transport_free_pkt(pkt); - break; - } - release_sock(sk); - - /* Release refcnt obtained when we fetched this socket out of - * the unbound list. - */ - sock_put(sk); - return; - } else if (le16_to_cpu(pkt->hdr.type) == VIRTIO_VSOCK_TYPE_STREAM) { - /* The socket must be in connected or bound table - * otherwise send reset back - */ - sk = vsock_find_connected_socket(&src, &dst); - if (!sk) { - sk = vsock_find_bound_socket(&dst); - if (!sk) { - pr_debug("%s: can not find bound_socket\n", __func__); - virtio_vsock_dumppkt(__func__, pkt); - /* Ignore this pkt instead of sending reset back */ - /* TODO send a RST unless this packet is a RST (to avoid infinite loops) */ - goto free_pkt; - } - } - - vsk = vsock_sk(sk); - trans = vsk->trans; - BUG_ON(!trans); - - virtio_transport_space_update(sk, pkt); - - lock_sock(sk); - switch (sk->sk_state) { - case VSOCK_SS_LISTEN: - virtio_transport_recv_listen(sk, pkt); - virtio_transport_free_pkt(pkt); - break; - case SS_CONNECTING: - virtio_transport_recv_connecting(sk, pkt); - virtio_transport_free_pkt(pkt); - break; - case SS_CONNECTED: - virtio_transport_recv_connected(sk, pkt); - break; - default: - virtio_transport_free_pkt(pkt); - break; - } - release_sock(sk); - - /* Release refcnt obtained when we fetched this socket out of the - * bound or connected list. - */ - sock_put(sk); - } - return; - -free_pkt: - virtio_transport_free_pkt(pkt); -} -EXPORT_SYMBOL_GPL(virtio_transport_recv_pkt); - -void virtio_transport_free_pkt(struct virtio_vsock_pkt *pkt) -{ - kfree(pkt->buf); - kfree(pkt); -} -EXPORT_SYMBOL_GPL(virtio_transport_free_pkt); - -static int __init virtio_vsock_common_init(void) -{ - get_random_bytes(vsockcookie_secret, sizeof(vsockcookie_secret)); - return 0; -} - -static void __exit virtio_vsock_common_exit(void) -{ -} - -module_init(virtio_vsock_common_init); -module_exit(virtio_vsock_common_exit); -MODULE_LICENSE("GPL v2"); -MODULE_AUTHOR("Asias He"); -MODULE_DESCRIPTION("common code for virtio vsock"); -- cgit v1.2.3 From 33d5a7b14bfd02e60af9d223db8dfff0cbcabe6b Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 28 Nov 2015 21:53:04 +0100 Subject: netfilter: nf_tables: extend tracing infrastructure nft monitor mode can then decode and display this trace data. Parts of LL/Network/Transport headers are provided as separate attributes. Otherwise, printing IP address data becomes virtually impossible for userspace since in the case of the netdev family we really don't want userspace to have to know all the possible link layer types and/or sizes just to display/print an ip address. We also don't want userspace to have to follow ipv6 header chains to get the s/dport info, the kernel already did this work for us. To avoid bloating nft_do_chain all data required for tracing is encapsulated in nft_traceinfo. The structure is initialized unconditionally(!) for each nft_do_chain invocation. 
This unconditional call will be moved under a static key in a followup patch. With lots of help from Patrick McHardy and Pablo Neira. Signed-off-by: Florian Westphal Acked-by: Patrick McHardy Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 32 ++++ include/uapi/linux/netfilter/nf_tables.h | 52 ++++++ include/uapi/linux/netfilter/nfnetlink.h | 2 + net/netfilter/Makefile | 2 +- net/netfilter/nf_tables_api.c | 12 +- net/netfilter/nf_tables_core.c | 45 +++-- net/netfilter/nf_tables_trace.c | 271 +++++++++++++++++++++++++++++++ net/netfilter/nfnetlink.c | 1 + 8 files changed, 398 insertions(+), 19 deletions(-) create mode 100644 net/netfilter/nf_tables_trace.c (limited to 'include/uapi/linux') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 101d7d7ec243..b313cda49194 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -888,6 +888,38 @@ void nft_unregister_chain_type(const struct nf_chain_type *); int nft_register_expr(struct nft_expr_type *); void nft_unregister_expr(struct nft_expr_type *); +int nft_verdict_dump(struct sk_buff *skb, int type, + const struct nft_verdict *v); + +/** + * struct nft_traceinfo - nft tracing information and state + * + * @pkt: pktinfo currently processed + * @basechain: base chain currently processed + * @chain: chain currently processed + * @rule: rule that was evaluated + * @verdict: verdict given by rule + * @type: event type (enum nft_trace_types) + * @packet_dumped: packet headers sent in a previous traceinfo message + * @trace: other struct members are initialised + */ +struct nft_traceinfo { + const struct nft_pktinfo *pkt; + const struct nft_base_chain *basechain; + const struct nft_chain *chain; + const struct nft_rule *rule; + const struct nft_verdict *verdict; + enum nft_trace_types type; + bool packet_dumped; + bool trace; +}; + +void nft_trace_init(struct nft_traceinfo *info, const struct nft_pktinfo *pkt, + const struct nft_verdict *verdict, + const struct nft_chain *basechain); + +void nft_trace_notify(struct nft_traceinfo *info); + #define nft_dereference(p) \ nfnl_dereference(p, NFNL_SUBSYS_NFTABLES) diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 5f3ececf84b3..b48a3ab761f8 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -83,6 +83,7 @@ enum nft_verdicts { * @NFT_MSG_DELSETELEM: delete a set element (enum nft_set_elem_attributes) * @NFT_MSG_NEWGEN: announce a new generation, only for events (enum nft_gen_attributes) * @NFT_MSG_GETGEN: get the rule-set generation (enum nft_gen_attributes) + * @NFT_MSG_TRACE: trace event (enum nft_trace_attributes) */ enum nf_tables_msg_types { NFT_MSG_NEWTABLE, @@ -102,6 +103,7 @@ enum nf_tables_msg_types { NFT_MSG_DELSETELEM, NFT_MSG_NEWGEN, NFT_MSG_GETGEN, + NFT_MSG_TRACE, NFT_MSG_MAX, }; @@ -987,4 +989,54 @@ enum nft_gen_attributes { }; #define NFTA_GEN_MAX (__NFTA_GEN_MAX - 1) +/** + * enum nft_trace_attributes - nf_tables trace netlink attributes + * + * @NFTA_TRACE_TABLE: name of the table (NLA_STRING) + * @NFTA_TRACE_CHAIN: name of the chain (NLA_STRING) + * @NFTA_TRACE_RULE_HANDLE: numeric handle of the rule (NLA_U64) + * @NFTA_TRACE_TYPE: type of the event (NLA_U32: nft_trace_types) + * @NFTA_TRACE_VERDICT: verdict returned by hook (NLA_NESTED: nft_verdicts) + * @NFTA_TRACE_ID: pseudo-id, same for each skb traced (NLA_U32) + * @NFTA_TRACE_LL_HEADER: linklayer header (NLA_BINARY) + *
@NFTA_TRACE_NETWORK_HEADER: network header (NLA_BINARY) + * @NFTA_TRACE_TRANSPORT_HEADER: transport header (NLA_BINARY) + * @NFTA_TRACE_IIF: indev ifindex (NLA_U32) + * @NFTA_TRACE_IIFTYPE: netdev->type of indev (NLA_U16) + * @NFTA_TRACE_OIF: outdev ifindex (NLA_U32) + * @NFTA_TRACE_OIFTYPE: netdev->type of outdev (NLA_U16) + * @NFTA_TRACE_MARK: nfmark (NLA_U32) + * @NFTA_TRACE_NFPROTO: nf protocol processed (NLA_U32) + * @NFTA_TRACE_POLICY: policy that decided fate of packet (NLA_U32) + */ +enum nft_trace_attibutes { + NFTA_TRACE_UNSPEC, + NFTA_TRACE_TABLE, + NFTA_TRACE_CHAIN, + NFTA_TRACE_RULE_HANDLE, + NFTA_TRACE_TYPE, + NFTA_TRACE_VERDICT, + NFTA_TRACE_ID, + NFTA_TRACE_LL_HEADER, + NFTA_TRACE_NETWORK_HEADER, + NFTA_TRACE_TRANSPORT_HEADER, + NFTA_TRACE_IIF, + NFTA_TRACE_IIFTYPE, + NFTA_TRACE_OIF, + NFTA_TRACE_OIFTYPE, + NFTA_TRACE_MARK, + NFTA_TRACE_NFPROTO, + NFTA_TRACE_POLICY, + __NFTA_TRACE_MAX +}; +#define NFTA_TRACE_MAX (__NFTA_TRACE_MAX - 1) + +enum nft_trace_types { + NFT_TRACETYPE_UNSPEC, + NFT_TRACETYPE_POLICY, + NFT_TRACETYPE_RETURN, + NFT_TRACETYPE_RULE, + __NFT_TRACETYPE_MAX +}; +#define NFT_TRACETYPE_MAX (__NFT_TRACETYPE_MAX - 1) #endif /* _LINUX_NF_TABLES_H */ diff --git a/include/uapi/linux/netfilter/nfnetlink.h b/include/uapi/linux/netfilter/nfnetlink.h index 354a7e5e50f2..4bb8cb7730e7 100644 --- a/include/uapi/linux/netfilter/nfnetlink.h +++ b/include/uapi/linux/netfilter/nfnetlink.h @@ -22,6 +22,8 @@ enum nfnetlink_groups { #define NFNLGRP_NFTABLES NFNLGRP_NFTABLES NFNLGRP_ACCT_QUOTA, #define NFNLGRP_ACCT_QUOTA NFNLGRP_ACCT_QUOTA + NFNLGRP_NFTRACE, +#define NFNLGRP_NFTRACE NFNLGRP_NFTRACE __NFNLGRP_MAX, }; #define NFNLGRP_MAX (__NFNLGRP_MAX - 1) diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 7638c36b498c..22934846b5d1 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -67,7 +67,7 @@ obj-$(CONFIG_NF_NAT_TFTP) += nf_nat_tftp.o obj-$(CONFIG_NETFILTER_SYNPROXY) += nf_synproxy_core.o # nf_tables -nf_tables-objs += nf_tables_core.o nf_tables_api.o +nf_tables-objs += nf_tables_core.o nf_tables_api.o nf_tables_trace.o nf_tables-objs += nft_immediate.o nft_cmp.o nft_lookup.o nft_dynset.o nf_tables-objs += nft_bitwise.o nft_byteorder.o nft_payload.o diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 93cc4737018f..c4969a0d54ba 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -4446,22 +4446,22 @@ static void nft_verdict_uninit(const struct nft_data *data) } } -static int nft_verdict_dump(struct sk_buff *skb, const struct nft_data *data) +int nft_verdict_dump(struct sk_buff *skb, int type, const struct nft_verdict *v) { struct nlattr *nest; - nest = nla_nest_start(skb, NFTA_DATA_VERDICT); + nest = nla_nest_start(skb, type); if (!nest) goto nla_put_failure; - if (nla_put_be32(skb, NFTA_VERDICT_CODE, htonl(data->verdict.code))) + if (nla_put_be32(skb, NFTA_VERDICT_CODE, htonl(v->code))) goto nla_put_failure; - switch (data->verdict.code) { + switch (v->code) { case NFT_JUMP: case NFT_GOTO: if (nla_put_string(skb, NFTA_VERDICT_CHAIN, - data->verdict.chain->name)) + v->chain->name)) goto nla_put_failure; } nla_nest_end(skb, nest); @@ -4572,7 +4572,7 @@ int nft_data_dump(struct sk_buff *skb, int attr, const struct nft_data *data, err = nft_value_dump(skb, data, len); break; case NFT_DATA_VERDICT: - err = nft_verdict_dump(skb, data); + err = nft_verdict_dump(skb, NFTA_DATA_VERDICT, &data->verdict); break; default: err = -EINVAL; diff --git a/net/netfilter/nf_tables_core.c 
b/net/netfilter/nf_tables_core.c index f3695a497408..2395de7c8ab2 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -44,22 +44,36 @@ static struct nf_loginfo trace_loginfo = { }, }; -static void __nft_trace_packet(const struct nft_pktinfo *pkt, - const struct nft_chain *chain, - int rulenum, enum nft_trace type) +static noinline void __nft_trace_packet(struct nft_traceinfo *info, + const struct nft_chain *chain, + int rulenum, enum nft_trace type) { + const struct nft_pktinfo *pkt = info->pkt; + + if (!pkt->skb->nf_trace) + return; + + info->chain = chain; + info->type = type; + + nft_trace_notify(info); + nf_log_trace(pkt->net, pkt->pf, pkt->hook, pkt->skb, pkt->in, pkt->out, &trace_loginfo, "TRACE: %s:%s:%s:%u ", chain->table->name, chain->name, comments[type], rulenum); } -static inline void nft_trace_packet(const struct nft_pktinfo *pkt, +static inline void nft_trace_packet(struct nft_traceinfo *info, const struct nft_chain *chain, - int rulenum, enum nft_trace type) + const struct nft_rule *rule, + int rulenum, + enum nft_trace_types type) { - if (unlikely(pkt->skb->nf_trace)) - __nft_trace_packet(pkt, chain, rulenum, type); + if (unlikely(info->trace)) { + info->rule = rule; + __nft_trace_packet(info, chain, rulenum, type); + } } static void nft_cmp_fast_eval(const struct nft_expr *expr, @@ -121,7 +135,9 @@ nft_do_chain(struct nft_pktinfo *pkt, void *priv) struct nft_stats *stats; int rulenum; unsigned int gencursor = nft_genmask_cur(net); + struct nft_traceinfo info; + nft_trace_init(&info, pkt, ®s.verdict, basechain); do_chain: rulenum = 0; rule = list_entry(&chain->rules, struct nft_rule, list); @@ -151,7 +167,8 @@ next_rule: regs.verdict.code = NFT_CONTINUE; continue; case NFT_CONTINUE: - nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RULE); + nft_trace_packet(&info, chain, rule, + rulenum, NFT_TRACETYPE_RULE); continue; } break; @@ -161,7 +178,8 @@ next_rule: case NF_ACCEPT: case NF_DROP: case NF_QUEUE: - nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RULE); + nft_trace_packet(&info, chain, rule, + rulenum, NFT_TRACETYPE_RULE); return regs.verdict.code; } @@ -174,7 +192,8 @@ next_rule: stackptr++; /* fall through */ case NFT_GOTO: - nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RULE); + nft_trace_packet(&info, chain, rule, + rulenum, NFT_TRACETYPE_RULE); chain = regs.verdict.chain; goto do_chain; @@ -182,7 +201,8 @@ next_rule: rulenum++; /* fall through */ case NFT_RETURN: - nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RETURN); + nft_trace_packet(&info, chain, rule, + rulenum, NFT_TRACETYPE_RETURN); break; default: WARN_ON(1); @@ -196,7 +216,8 @@ next_rule: goto next_rule; } - nft_trace_packet(pkt, basechain, -1, NFT_TRACE_POLICY); + nft_trace_packet(&info, basechain, NULL, -1, + NFT_TRACETYPE_POLICY); rcu_read_lock_bh(); stats = this_cpu_ptr(rcu_dereference(nft_base_chain(basechain)->stats)); diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c new file mode 100644 index 000000000000..36fd7ad6729a --- /dev/null +++ b/net/netfilter/nf_tables_trace.c @@ -0,0 +1,271 @@ +/* + * (C) 2015 Red Hat GmbH + * Author: Florian Westphal + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define NFT_TRACETYPE_LL_HSIZE 20 +#define NFT_TRACETYPE_NETWORK_HSIZE 40 +#define NFT_TRACETYPE_TRANSPORT_HSIZE 20 + +static int trace_fill_id(struct sk_buff *nlskb, struct sk_buff *skb) +{ + __be32 id; + + /* using skb address as ID results in a limited number of + * values (and quick reuse). + * + * So we attempt to use as many skb members that will not + * change while skb is with netfilter. + */ + id = (__be32)jhash_2words(hash32_ptr(skb), skb_get_hash(skb), + skb->skb_iif); + + return nla_put_be32(nlskb, NFTA_TRACE_ID, id); +} + +static int trace_fill_header(struct sk_buff *nlskb, u16 type, + const struct sk_buff *skb, + int off, unsigned int len) +{ + struct nlattr *nla; + + if (len == 0) + return 0; + + nla = nla_reserve(nlskb, type, len); + if (!nla || skb_copy_bits(skb, off, nla_data(nla), len)) + return -1; + + return 0; +} + +static int nf_trace_fill_ll_header(struct sk_buff *nlskb, + const struct sk_buff *skb) +{ + struct vlan_ethhdr veth; + int off; + + BUILD_BUG_ON(sizeof(veth) > NFT_TRACETYPE_LL_HSIZE); + + off = skb_mac_header(skb) - skb->data; + if (off != -ETH_HLEN) + return -1; + + if (skb_copy_bits(skb, off, &veth, ETH_HLEN)) + return -1; + + veth.h_vlan_proto = skb->vlan_proto; + veth.h_vlan_TCI = htons(skb_vlan_tag_get(skb)); + veth.h_vlan_encapsulated_proto = skb->protocol; + + return nla_put(nlskb, NFTA_TRACE_LL_HEADER, sizeof(veth), &veth); +} + +static int nf_trace_fill_dev_info(struct sk_buff *nlskb, + const struct net_device *indev, + const struct net_device *outdev) +{ + if (indev) { + if (nla_put_be32(nlskb, NFTA_TRACE_IIF, + htonl(indev->ifindex))) + return -1; + + if (nla_put_be16(nlskb, NFTA_TRACE_IIFTYPE, + htons(indev->type))) + return -1; + } + + if (outdev) { + if (nla_put_be32(nlskb, NFTA_TRACE_OIF, + htonl(outdev->ifindex))) + return -1; + + if (nla_put_be16(nlskb, NFTA_TRACE_OIFTYPE, + htons(outdev->type))) + return -1; + } + + return 0; +} + +static int nf_trace_fill_pkt_info(struct sk_buff *nlskb, + const struct nft_pktinfo *pkt) +{ + const struct sk_buff *skb = pkt->skb; + unsigned int len = min_t(unsigned int, + pkt->xt.thoff - skb_network_offset(skb), + NFT_TRACETYPE_NETWORK_HSIZE); + int off = skb_network_offset(skb); + + if (trace_fill_header(nlskb, NFTA_TRACE_NETWORK_HEADER, skb, off, len)) + return -1; + + len = min_t(unsigned int, skb->len - pkt->xt.thoff, + NFT_TRACETYPE_TRANSPORT_HSIZE); + + if (trace_fill_header(nlskb, NFTA_TRACE_TRANSPORT_HEADER, skb, + pkt->xt.thoff, len)) + return -1; + + if (!skb_mac_header_was_set(skb)) + return 0; + + if (skb_vlan_tag_get(skb)) + return nf_trace_fill_ll_header(nlskb, skb); + + off = skb_mac_header(skb) - skb->data; + len = min_t(unsigned int, -off, NFT_TRACETYPE_LL_HSIZE); + return trace_fill_header(nlskb, NFTA_TRACE_LL_HEADER, + skb, off, len); +} + +static int nf_trace_fill_rule_info(struct sk_buff *nlskb, + const struct nft_traceinfo *info) +{ + if (!info->rule) + return 0; + + /* a continue verdict with ->type == RETURN means that this is + * an implicit return (end of chain reached). + * + * Since no rule matched, the ->rule pointer is invalid. 
+ */ + if (info->type == NFT_TRACETYPE_RETURN && + info->verdict->code == NFT_CONTINUE) + return 0; + + return nla_put_be64(nlskb, NFTA_TRACE_RULE_HANDLE, + cpu_to_be64(info->rule->handle)); +} + +void nft_trace_notify(struct nft_traceinfo *info) +{ + const struct nft_pktinfo *pkt = info->pkt; + struct nfgenmsg *nfmsg; + struct nlmsghdr *nlh; + struct sk_buff *skb; + unsigned int size; + int event = (NFNL_SUBSYS_NFTABLES << 8) | NFT_MSG_TRACE; + + if (!nfnetlink_has_listeners(pkt->net, NFNLGRP_NFTRACE)) + return; + + size = nlmsg_total_size(sizeof(struct nfgenmsg)) + + nla_total_size(NFT_TABLE_MAXNAMELEN) + + nla_total_size(NFT_CHAIN_MAXNAMELEN) + + nla_total_size(sizeof(__be64)) + /* rule handle */ + nla_total_size(sizeof(__be32)) + /* trace type */ + nla_total_size(0) + /* VERDICT, nested */ + nla_total_size(sizeof(u32)) + /* verdict code */ + nla_total_size(NFT_CHAIN_MAXNAMELEN) + /* jump target */ + nla_total_size(sizeof(u32)) + /* id */ + nla_total_size(NFT_TRACETYPE_LL_HSIZE) + + nla_total_size(NFT_TRACETYPE_NETWORK_HSIZE) + + nla_total_size(NFT_TRACETYPE_TRANSPORT_HSIZE) + + nla_total_size(sizeof(u32)) + /* iif */ + nla_total_size(sizeof(__be16)) + /* iiftype */ + nla_total_size(sizeof(u32)) + /* oif */ + nla_total_size(sizeof(__be16)) + /* oiftype */ + nla_total_size(sizeof(u32)) + /* mark */ + nla_total_size(sizeof(u32)) + /* nfproto */ + nla_total_size(sizeof(u32)); /* policy */ + + skb = nlmsg_new(size, GFP_ATOMIC); + if (!skb) + return; + + nlh = nlmsg_put(skb, 0, 0, event, sizeof(struct nfgenmsg), 0); + if (!nlh) + goto nla_put_failure; + + nfmsg = nlmsg_data(nlh); + nfmsg->nfgen_family = info->basechain->type->family; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = 0; + + if (nla_put_be32(skb, NFTA_TRACE_NFPROTO, htonl(pkt->pf))) + goto nla_put_failure; + + if (nla_put_be32(skb, NFTA_TRACE_TYPE, htonl(info->type))) + goto nla_put_failure; + + if (trace_fill_id(skb, pkt->skb)) + goto nla_put_failure; + + if (info->chain) { + if (nla_put_string(skb, NFTA_TRACE_CHAIN, + info->chain->name)) + goto nla_put_failure; + if (nla_put_string(skb, NFTA_TRACE_TABLE, + info->chain->table->name)) + goto nla_put_failure; + } + + if (nf_trace_fill_rule_info(skb, info)) + goto nla_put_failure; + + switch (info->type) { + case NFT_TRACETYPE_UNSPEC: + case __NFT_TRACETYPE_MAX: + break; + case NFT_TRACETYPE_RETURN: + case NFT_TRACETYPE_RULE: + if (nft_verdict_dump(skb, NFTA_TRACE_VERDICT, info->verdict)) + goto nla_put_failure; + break; + case NFT_TRACETYPE_POLICY: + if (nla_put_be32(skb, NFTA_TRACE_POLICY, + info->basechain->policy)) + goto nla_put_failure; + break; + } + + if (pkt->skb->mark && + nla_put_be32(skb, NFTA_TRACE_MARK, htonl(pkt->skb->mark))) + goto nla_put_failure; + + if (!info->packet_dumped) { + if (nf_trace_fill_dev_info(skb, pkt->in, pkt->out)) + goto nla_put_failure; + + if (nf_trace_fill_pkt_info(skb, pkt)) + goto nla_put_failure; + info->packet_dumped = true; + } + + nlmsg_end(skb, nlh); + nfnetlink_send(skb, pkt->net, 0, NFNLGRP_NFTRACE, 0, GFP_ATOMIC); + return; + + nla_put_failure: + WARN_ON_ONCE(1); + kfree_skb(skb); +} + +void nft_trace_init(struct nft_traceinfo *info, const struct nft_pktinfo *pkt, + const struct nft_verdict *verdict, + const struct nft_chain *chain) +{ + info->basechain = nft_base_chain(chain); + info->trace = true; + info->packet_dumped = false; + info->pkt = pkt; + info->verdict = verdict; +} diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 46453ab318db..28591fa94ba5 100644 --- a/net/netfilter/nfnetlink.c +++ 
b/net/netfilter/nfnetlink.c @@ -49,6 +49,7 @@ static const int nfnl_group2type[NFNLGRP_MAX+1] = { [NFNLGRP_CONNTRACK_EXP_DESTROY] = NFNL_SUBSYS_CTNETLINK_EXP, [NFNLGRP_NFTABLES] = NFNL_SUBSYS_NFTABLES, [NFNLGRP_ACCT_QUOTA] = NFNL_SUBSYS_ACCT, + [NFNLGRP_NFTRACE] = NFNL_SUBSYS_NFTABLES, }; void nfnl_lock(__u8 subsys_id) -- cgit v1.2.3 From abe492b4f50c3ae2ebcfaa2f5c16176aebaa1c68 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Thu, 10 Dec 2015 12:37:45 -0800 Subject: geneve: UDP checksum configuration via netlink Add support to enable and disable UDP checksums via netlink. This is similar to how VXLAN and GUE allow this. This includes support for enabling the UDP zero checksum (for both TX and RX). Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- drivers/net/geneve.c | 93 +++++++++++++++++++++++++++++++++----------- include/uapi/linux/if_link.h | 3 ++ 2 files changed, 73 insertions(+), 23 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index de5c30c9f059..0750d7a93878 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -71,8 +71,14 @@ struct geneve_dev { __be16 dst_port; bool collect_md; struct gro_cells gro_cells; + u32 flags; }; +/* Geneve device flags */ +#define GENEVE_F_UDP_CSUM BIT(0) +#define GENEVE_F_UDP_ZERO_CSUM6_TX BIT(1) +#define GENEVE_F_UDP_ZERO_CSUM6_RX BIT(2) + struct geneve_sock { bool collect_md; struct list_head list; @@ -81,6 +87,7 @@ struct geneve_sock { int refcnt; struct udp_offload udp_offloads; struct hlist_head vni_list[VNI_HASH_SIZE]; + u32 flags; }; static inline __u32 geneve_net_vni_hash(u8 vni[3]) @@ -343,7 +350,7 @@ error: } static struct socket *geneve_create_sock(struct net *net, bool ipv6, - __be16 port) + __be16 port, u32 flags) { struct socket *sock; struct udp_port_cfg udp_conf; @@ -354,6 +361,8 @@ static struct socket *geneve_create_sock(struct net *net, bool ipv6, if (ipv6) { udp_conf.family = AF_INET6; udp_conf.ipv6_v6only = 1; + udp_conf.use_udp6_rx_checksums = + !(flags & GENEVE_F_UDP_ZERO_CSUM6_RX); } else { udp_conf.family = AF_INET; udp_conf.local_ip.s_addr = htonl(INADDR_ANY); @@ -480,7 +489,7 @@ static int geneve_gro_complete(struct sk_buff *skb, int nhoff, /* Create new listen socket if needed */ static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port, - bool ipv6) + bool ipv6, u32 flags) { struct geneve_net *gn = net_generic(net, geneve_net_id); struct geneve_sock *gs; @@ -492,7 +501,7 @@ static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port, if (!gs) return ERR_PTR(-ENOMEM); - sock = geneve_create_sock(net, ipv6, port); + sock = geneve_create_sock(net, ipv6, port, flags); if (IS_ERR(sock)) { kfree(gs); return ERR_CAST(sock); @@ -575,12 +584,13 @@ static int geneve_sock_add(struct geneve_dev *geneve, bool ipv6) goto out; } - gs = geneve_socket_create(net, geneve->dst_port, ipv6); + gs = geneve_socket_create(net, geneve->dst_port, ipv6, geneve->flags); if (IS_ERR(gs)) return PTR_ERR(gs); out: gs->collect_md = geneve->collect_md; + gs->flags = geneve->flags; #if IS_ENABLED(CONFIG_IPV6) if (ipv6) geneve->sock6 = gs; @@ -642,11 +652,12 @@ static void geneve_build_header(struct genevehdr *geneveh, static int geneve_build_skb(struct rtable *rt, struct sk_buff *skb, __be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt, - bool csum, bool xnet) + u32 flags, bool xnet) { struct genevehdr *gnvh; int min_headroom; int err; + bool udp_sum = !!(flags & GENEVE_F_UDP_CSUM); skb_scrub_packet(skb, xnet); @@ -658,7 +669,7 @@ static int 
geneve_build_skb(struct rtable *rt, struct sk_buff *skb, goto free_rt; } - skb = udp_tunnel_handle_offloads(skb, csum); + skb = udp_tunnel_handle_offloads(skb, udp_sum); if (IS_ERR(skb)) { err = PTR_ERR(skb); goto free_rt; @@ -678,11 +689,12 @@ free_rt: #if IS_ENABLED(CONFIG_IPV6) static int geneve6_build_skb(struct dst_entry *dst, struct sk_buff *skb, __be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt, - bool csum, bool xnet) + u32 flags, bool xnet) { struct genevehdr *gnvh; int min_headroom; int err; + bool udp_sum = !(flags & GENEVE_F_UDP_ZERO_CSUM6_TX); skb_scrub_packet(skb, xnet); @@ -694,7 +706,7 @@ static int geneve6_build_skb(struct dst_entry *dst, struct sk_buff *skb, goto free_dst; } - skb = udp_tunnel_handle_offloads(skb, csum); + skb = udp_tunnel_handle_offloads(skb, udp_sum); if (IS_ERR(skb)) { err = PTR_ERR(skb); goto free_dst; @@ -824,9 +836,9 @@ static netdev_tx_t geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev, struct flowi4 fl4; __u8 tos, ttl; __be16 sport; - bool udp_csum; __be16 df; bool xnet = !net_eq(geneve->net, dev_net(geneve->dev)); + u32 flags = geneve->flags; if (geneve->collect_md) { if (unlikely(!info || !(info->mode & IP_TUNNEL_INFO_TX))) { @@ -857,9 +869,13 @@ static netdev_tx_t geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev, if (key->tun_flags & TUNNEL_GENEVE_OPT) opts = ip_tunnel_info_opts(info); - udp_csum = !!(key->tun_flags & TUNNEL_CSUM); + if (key->tun_flags & TUNNEL_CSUM) + flags |= GENEVE_F_UDP_CSUM; + else + flags &= ~GENEVE_F_UDP_CSUM; + err = geneve_build_skb(rt, skb, key->tun_flags, vni, - info->options_len, opts, udp_csum, xnet); + info->options_len, opts, flags, xnet); if (unlikely(err)) goto err; @@ -867,9 +883,8 @@ static netdev_tx_t geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev, ttl = key->ttl; df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? 
htons(IP_DF) : 0; } else { - udp_csum = false; err = geneve_build_skb(rt, skb, 0, geneve->vni, - 0, NULL, udp_csum, xnet); + 0, NULL, flags, xnet); if (unlikely(err)) goto err; @@ -883,7 +898,7 @@ static netdev_tx_t geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev, err = udp_tunnel_xmit_skb(rt, gs4->sock->sk, skb, fl4.saddr, fl4.daddr, tos, ttl, df, sport, geneve->dst_port, !net_eq(geneve->net, dev_net(geneve->dev)), - !udp_csum); + !(flags & GENEVE_F_UDP_CSUM)); iptunnel_xmit_stats(err, &dev->stats, dev->tstats); return NETDEV_TX_OK; @@ -912,8 +927,8 @@ static netdev_tx_t geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev, struct flowi6 fl6; __u8 prio, ttl; __be16 sport; - bool udp_csum; bool xnet = !net_eq(geneve->net, dev_net(geneve->dev)); + u32 flags = geneve->flags; if (geneve->collect_md) { if (unlikely(!info || !(info->mode & IP_TUNNEL_INFO_TX))) { @@ -942,19 +957,22 @@ static netdev_tx_t geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev, if (key->tun_flags & TUNNEL_GENEVE_OPT) opts = ip_tunnel_info_opts(info); - udp_csum = !!(key->tun_flags & TUNNEL_CSUM); + if (key->tun_flags & TUNNEL_CSUM) + flags |= GENEVE_F_UDP_CSUM; + else + flags &= ~GENEVE_F_UDP_CSUM; + err = geneve6_build_skb(dst, skb, key->tun_flags, vni, info->options_len, opts, - udp_csum, xnet); + flags, xnet); if (unlikely(err)) goto err; prio = ip_tunnel_ecn_encap(key->tos, iip, skb); ttl = key->ttl; } else { - udp_csum = false; err = geneve6_build_skb(dst, skb, 0, geneve->vni, - 0, NULL, udp_csum, xnet); + 0, NULL, flags, xnet); if (unlikely(err)) goto err; @@ -966,7 +984,8 @@ static netdev_tx_t geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev, } err = udp_tunnel6_xmit_skb(dst, gs6->sock->sk, skb, dev, &fl6.saddr, &fl6.daddr, prio, ttl, - sport, geneve->dst_port, !udp_csum); + sport, geneve->dst_port, + !!(flags & GENEVE_F_UDP_ZERO_CSUM6_TX)); iptunnel_xmit_stats(err, &dev->stats, dev->tstats); return NETDEV_TX_OK; @@ -1099,6 +1118,9 @@ static const struct nla_policy geneve_policy[IFLA_GENEVE_MAX + 1] = { [IFLA_GENEVE_TOS] = { .type = NLA_U8 }, [IFLA_GENEVE_PORT] = { .type = NLA_U16 }, [IFLA_GENEVE_COLLECT_METADATA] = { .type = NLA_FLAG }, + [IFLA_GENEVE_UDP_CSUM] = { .type = NLA_U8 }, + [IFLA_GENEVE_UDP_ZERO_CSUM6_TX] = { .type = NLA_U8 }, + [IFLA_GENEVE_UDP_ZERO_CSUM6_RX] = { .type = NLA_U8 }, }; static int geneve_validate(struct nlattr *tb[], struct nlattr *data[]) @@ -1152,7 +1174,7 @@ static struct geneve_dev *geneve_find_dev(struct geneve_net *gn, static int geneve_configure(struct net *net, struct net_device *dev, union geneve_addr *remote, __u32 vni, __u8 ttl, __u8 tos, __be16 dst_port, - bool metadata) + bool metadata, u32 flags) { struct geneve_net *gn = net_generic(net, geneve_net_id); struct geneve_dev *t, *geneve = netdev_priv(dev); @@ -1183,6 +1205,7 @@ static int geneve_configure(struct net *net, struct net_device *dev, geneve->tos = tos; geneve->dst_port = dst_port; geneve->collect_md = metadata; + geneve->flags = flags; t = geneve_find_dev(gn, dst_port, remote, geneve->vni, &tun_on_same_port, &tun_collect_md); @@ -1213,6 +1236,7 @@ static int geneve_newlink(struct net *net, struct net_device *dev, bool metadata = false; union geneve_addr remote = geneve_remote_unspec; __u32 vni = 0; + u32 flags = 0; if (data[IFLA_GENEVE_REMOTE] && data[IFLA_GENEVE_REMOTE6]) return -EINVAL; @@ -1253,8 +1277,20 @@ static int geneve_newlink(struct net *net, struct net_device *dev, if (data[IFLA_GENEVE_COLLECT_METADATA]) metadata = true; + if (data[IFLA_GENEVE_UDP_CSUM] && + 
nla_get_u8(data[IFLA_GENEVE_UDP_CSUM])) + flags |= GENEVE_F_UDP_CSUM; + + if (data[IFLA_GENEVE_UDP_ZERO_CSUM6_TX] && + nla_get_u8(data[IFLA_GENEVE_UDP_ZERO_CSUM6_TX])) + flags |= GENEVE_F_UDP_ZERO_CSUM6_TX; + + if (data[IFLA_GENEVE_UDP_ZERO_CSUM6_RX] && + nla_get_u8(data[IFLA_GENEVE_UDP_ZERO_CSUM6_RX])) + flags |= GENEVE_F_UDP_ZERO_CSUM6_RX; + return geneve_configure(net, dev, &remote, vni, ttl, tos, dst_port, - metadata); + metadata, flags); } static void geneve_dellink(struct net_device *dev, struct list_head *head) @@ -1273,6 +1309,9 @@ static size_t geneve_get_size(const struct net_device *dev) nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_TOS */ nla_total_size(sizeof(__be16)) + /* IFLA_GENEVE_PORT */ nla_total_size(0) + /* IFLA_GENEVE_COLLECT_METADATA */ + nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_UDP_CSUM */ + nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_UDP_ZERO_CSUM6_TX */ + nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_UDP_ZERO_CSUM6_RX */ 0; } @@ -1309,6 +1348,14 @@ static int geneve_fill_info(struct sk_buff *skb, const struct net_device *dev) goto nla_put_failure; } + if (nla_put_u8(skb, IFLA_GENEVE_UDP_CSUM, + !!(geneve->flags & GENEVE_F_UDP_CSUM)) || + nla_put_u8(skb, IFLA_GENEVE_UDP_ZERO_CSUM6_TX, + !!(geneve->flags & GENEVE_F_UDP_ZERO_CSUM6_TX)) || + nla_put_u8(skb, IFLA_GENEVE_UDP_ZERO_CSUM6_RX, + !!(geneve->flags & GENEVE_F_UDP_ZERO_CSUM6_RX))) + goto nla_put_failure; + return 0; nla_put_failure: @@ -1342,7 +1389,7 @@ struct net_device *geneve_dev_create_fb(struct net *net, const char *name, return dev; err = geneve_configure(net, dev, &geneve_remote_unspec, - 0, 0, 0, htons(dst_port), true); + 0, 0, 0, htons(dst_port), true, 0); if (err) { free_netdev(dev); return ERR_PTR(err); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 5ad57375a99f..2be1dd5a103f 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -462,6 +462,9 @@ enum { IFLA_GENEVE_PORT, /* destination port */ IFLA_GENEVE_COLLECT_METADATA, IFLA_GENEVE_REMOTE6, + IFLA_GENEVE_UDP_CSUM, + IFLA_GENEVE_UDP_ZERO_CSUM6_TX, + IFLA_GENEVE_UDP_ZERO_CSUM6_RX, __IFLA_GENEVE_MAX }; #define IFLA_GENEVE_MAX (__IFLA_GENEVE_MAX - 1) -- cgit v1.2.3 From 4ec8ff0edccffe7a77f18e2a1e2ce86f03e08b5c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Dec 2015 17:38:54 -0500 Subject: netfilter: prepare xt_cgroup for multi revisions xt_cgroup will grow cgroup2 path based match. Postfix existing symbols with _v0 and prepare for multi revision registration. 
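The conversion is mechanical, but as a point of reference, here is a minimal userspace sketch (illustrative only, not part of this patch; the iptables extension plumbing around it is assumed) of filling the renamed v0 structure:

    #include <string.h>
    #include <linux/netfilter/xt_cgroup.h>

    /* "id" is the net_cls classid to match against the socket's;
     * cgroup_mt_check_v0() rejects any invert bits other than bit 0,
     * so normalize the flag before handing the match data over. */
    static void cgroup_fill_v0(struct xt_cgroup_info_v0 *info,
                               __u32 classid, int invert)
    {
            memset(info, 0, sizeof(*info));
            info->id = classid;
            info->invert = invert ? 1 : 0;
    }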
Signed-off-by: Tejun Heo Cc: Daniel Borkmann Cc: Daniel Wagner CC: Neil Horman Cc: Jan Engelhardt Cc: Pablo Neira Ayuso Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/xt_cgroup.h | 2 +- net/netfilter/xt_cgroup.c | 36 +++++++++++++++++--------------- 2 files changed, 20 insertions(+), 18 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/xt_cgroup.h b/include/uapi/linux/netfilter/xt_cgroup.h index 43acb7e175f6..577c9e0b9406 100644 --- a/include/uapi/linux/netfilter/xt_cgroup.h +++ b/include/uapi/linux/netfilter/xt_cgroup.h @@ -3,7 +3,7 @@ #include -struct xt_cgroup_info { +struct xt_cgroup_info_v0 { __u32 id; __u32 invert; }; diff --git a/net/netfilter/xt_cgroup.c b/net/netfilter/xt_cgroup.c index 54eaeb45ce99..17300256772a 100644 --- a/net/netfilter/xt_cgroup.c +++ b/net/netfilter/xt_cgroup.c @@ -24,9 +24,9 @@ MODULE_DESCRIPTION("Xtables: process control group matching"); MODULE_ALIAS("ipt_cgroup"); MODULE_ALIAS("ip6t_cgroup"); -static int cgroup_mt_check(const struct xt_mtchk_param *par) +static int cgroup_mt_check_v0(const struct xt_mtchk_param *par) { - struct xt_cgroup_info *info = par->matchinfo; + struct xt_cgroup_info_v0 *info = par->matchinfo; if (info->invert & ~1) return -EINVAL; @@ -35,9 +35,9 @@ static int cgroup_mt_check(const struct xt_mtchk_param *par) } static bool -cgroup_mt(const struct sk_buff *skb, struct xt_action_param *par) +cgroup_mt_v0(const struct sk_buff *skb, struct xt_action_param *par) { - const struct xt_cgroup_info *info = par->matchinfo; + const struct xt_cgroup_info_v0 *info = par->matchinfo; if (skb->sk == NULL || !sk_fullsock(skb->sk)) return false; @@ -46,27 +46,29 @@ cgroup_mt(const struct sk_buff *skb, struct xt_action_param *par) info->invert; } -static struct xt_match cgroup_mt_reg __read_mostly = { - .name = "cgroup", - .revision = 0, - .family = NFPROTO_UNSPEC, - .checkentry = cgroup_mt_check, - .match = cgroup_mt, - .matchsize = sizeof(struct xt_cgroup_info), - .me = THIS_MODULE, - .hooks = (1 << NF_INET_LOCAL_OUT) | - (1 << NF_INET_POST_ROUTING) | - (1 << NF_INET_LOCAL_IN), +static struct xt_match cgroup_mt_reg[] __read_mostly = { + { + .name = "cgroup", + .revision = 0, + .family = NFPROTO_UNSPEC, + .checkentry = cgroup_mt_check_v0, + .match = cgroup_mt_v0, + .matchsize = sizeof(struct xt_cgroup_info_v0), + .me = THIS_MODULE, + .hooks = (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_POST_ROUTING) | + (1 << NF_INET_LOCAL_IN), + }, }; static int __init cgroup_mt_init(void) { - return xt_register_match(&cgroup_mt_reg); + return xt_register_matches(cgroup_mt_reg, ARRAY_SIZE(cgroup_mt_reg)); } static void __exit cgroup_mt_exit(void) { - xt_unregister_match(&cgroup_mt_reg); + xt_unregister_matches(cgroup_mt_reg, ARRAY_SIZE(cgroup_mt_reg)); } module_init(cgroup_mt_init); -- cgit v1.2.3 From c38c4597e4bf3e99860eac98211748e1ecb0e139 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Dec 2015 17:38:55 -0500 Subject: netfilter: implement xt_cgroup cgroup2 path match This patch implements xt_cgroup path match which matches cgroup2 membership of the associated socket. The match is recursive and invertible. For rationales on introducing another cgroup based match, please refer to a preceding commit "sock, cgroup: add sock->sk_cgroup". v3: Folded into xt_cgroup as a new revision interface as suggested by Pablo. v2: Included linux/limits.h from xt_cgroup2.h for PATH_MAX. Added explicit alignment to the priv field. Both suggested by Jan. 
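For illustration, a minimal userspace sketch (not part of this patch; the helper name is hypothetical) of populating the new v1 structure for a cgroup2 path match, which matches the socket's cgroup or any of its descendants:

    #include <string.h>
    #include <linux/netfilter/xt_cgroup.h>

    /* cgroup_mt_check_v1() requires exactly one of has_path/has_classid
     * to be set, and the kernel-internal priv field must stay NULL. */
    static int cgroup_fill_v1_path(struct xt_cgroup_info_v1 *info,
                                   const char *path, int invert)
    {
            memset(info, 0, sizeof(*info));         /* zeroes ->priv too */
            if (strlen(path) >= sizeof(info->path))
                    return -1;                      /* must fit in PATH_MAX */
            strcpy(info->path, path);               /* e.g. "/user.slice" */
            info->has_path = 1;
            info->invert_path = invert ? 1 : 0;
            return 0;
    }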
Signed-off-by: Tejun Heo Cc: Daniel Borkmann Cc: Daniel Wagner CC: Neil Horman Cc: Jan Engelhardt Cc: Pablo Neira Ayuso Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/xt_cgroup.h | 13 ++++++ net/netfilter/xt_cgroup.c | 69 ++++++++++++++++++++++++++++++++ 2 files changed, 82 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/xt_cgroup.h b/include/uapi/linux/netfilter/xt_cgroup.h index 577c9e0b9406..1e4b37b93bef 100644 --- a/include/uapi/linux/netfilter/xt_cgroup.h +++ b/include/uapi/linux/netfilter/xt_cgroup.h @@ -2,10 +2,23 @@ #define _UAPI_XT_CGROUP_H #include +#include struct xt_cgroup_info_v0 { __u32 id; __u32 invert; }; +struct xt_cgroup_info_v1 { + __u8 has_path; + __u8 has_classid; + __u8 invert_path; + __u8 invert_classid; + char path[PATH_MAX]; + __u32 classid; + + /* kernel internal data */ + void *priv __attribute__((aligned(8))); +}; + #endif /* _UAPI_XT_CGROUP_H */ diff --git a/net/netfilter/xt_cgroup.c b/net/netfilter/xt_cgroup.c index 17300256772a..a086a914865f 100644 --- a/net/netfilter/xt_cgroup.c +++ b/net/netfilter/xt_cgroup.c @@ -34,6 +34,37 @@ static int cgroup_mt_check_v0(const struct xt_mtchk_param *par) return 0; } +static int cgroup_mt_check_v1(const struct xt_mtchk_param *par) +{ + struct xt_cgroup_info_v1 *info = par->matchinfo; + struct cgroup *cgrp; + + if ((info->invert_path & ~1) || (info->invert_classid & ~1)) + return -EINVAL; + + if (!info->has_path && !info->has_classid) { + pr_info("xt_cgroup: no path or classid specified\n"); + return -EINVAL; + } + + if (info->has_path && info->has_classid) { + pr_info("xt_cgroup: both path and classid specified\n"); + return -EINVAL; + } + + if (info->has_path) { + cgrp = cgroup_get_from_path(info->path); + if (IS_ERR(cgrp)) { + pr_info("xt_cgroup: invalid path, errno=%ld\n", + PTR_ERR(cgrp)); + return -EINVAL; + } + info->priv = cgrp; + } + + return 0; +} + static bool cgroup_mt_v0(const struct sk_buff *skb, struct xt_action_param *par) { @@ -46,6 +77,31 @@ cgroup_mt_v0(const struct sk_buff *skb, struct xt_action_param *par) info->invert; } +static bool cgroup_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_cgroup_info_v1 *info = par->matchinfo; + struct sock_cgroup_data *skcd = &skb->sk->sk_cgrp_data; + struct cgroup *ancestor = info->priv; + + if (!skb->sk || !sk_fullsock(skb->sk)) + return false; + + if (ancestor) + return cgroup_is_descendant(sock_cgroup_ptr(skcd), ancestor) ^ + info->invert_path; + else + return (info->classid == sock_cgroup_classid(skcd)) ^ + info->invert_classid; +} + +static void cgroup_mt_destroy_v1(const struct xt_mtdtor_param *par) +{ + struct xt_cgroup_info_v1 *info = par->matchinfo; + + if (info->priv) + cgroup_put(info->priv); +} + static struct xt_match cgroup_mt_reg[] __read_mostly = { { .name = "cgroup", @@ -59,6 +115,19 @@ static struct xt_match cgroup_mt_reg[] __read_mostly = { (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_LOCAL_IN), }, + { + .name = "cgroup", + .revision = 1, + .family = NFPROTO_UNSPEC, + .checkentry = cgroup_mt_check_v1, + .match = cgroup_mt_v1, + .matchsize = sizeof(struct xt_cgroup_info_v1), + .destroy = cgroup_mt_destroy_v1, + .me = THIS_MODULE, + .hooks = (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_POST_ROUTING) | + (1 << NF_INET_LOCAL_IN), + }, }; static int __init cgroup_mt_init(void) -- cgit v1.2.3 From 7f00feaf107645d95a6d87e99b4d141ac0a08efd Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 15 Dec 2015 15:41:38 -0800 Subject: ila: Add generic ILA translation 
facility This patch implements an ILA translation table. This table can be configured with identifier to locator mappings, and can be queried to resolve a mapping. Queries can be parameterized based on interface, direction (incoming or outgoing), and matching locator. The table is implemented using rhashtable and is configured via netlink (through "ip ila .." in iproute). The table may be used as an alternative means to do ILA translations other than the LWT tunnels. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/ila.h | 18 ++ include/uapi/linux/ila.h | 22 ++ net/ipv6/ila/Makefile | 2 +- net/ipv6/ila/ila.h | 2 + net/ipv6/ila/ila_common.c | 8 + net/ipv6/ila/ila_xlat.c | 680 ++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 731 insertions(+), 1 deletion(-) create mode 100644 include/net/ila.h create mode 100644 net/ipv6/ila/ila_xlat.c (limited to 'include/uapi/linux') diff --git a/include/net/ila.h b/include/net/ila.h new file mode 100644 index 000000000000..9f4f43e94ae4 --- /dev/null +++ b/include/net/ila.h @@ -0,0 +1,18 @@ +/* + * ILA kernel interface + * + * Copyright (c) 2015 Tom Herbert + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + */ + +#ifndef _NET_ILA_H +#define _NET_ILA_H + +int ila_xlat_outgoing(struct sk_buff *skb); +int ila_xlat_incoming(struct sk_buff *skb); + +#endif /* _NET_ILA_H */ diff --git a/include/uapi/linux/ila.h b/include/uapi/linux/ila.h index 7ed9e670814e..abde7bbd6f3b 100644 --- a/include/uapi/linux/ila.h +++ b/include/uapi/linux/ila.h @@ -3,13 +3,35 @@ #ifndef _UAPI_LINUX_ILA_H #define _UAPI_LINUX_ILA_H +/* NETLINK_GENERIC related info */ +#define ILA_GENL_NAME "ila" +#define ILA_GENL_VERSION 0x1 + enum { ILA_ATTR_UNSPEC, ILA_ATTR_LOCATOR, /* u64 */ + ILA_ATTR_IDENTIFIER, /* u64 */ + ILA_ATTR_LOCATOR_MATCH, /* u64 */ + ILA_ATTR_IFINDEX, /* s32 */ + ILA_ATTR_DIR, /* u32 */ __ILA_ATTR_MAX, }; #define ILA_ATTR_MAX (__ILA_ATTR_MAX - 1) +enum { + ILA_CMD_UNSPEC, + ILA_CMD_ADD, + ILA_CMD_DEL, + ILA_CMD_GET, + + __ILA_CMD_MAX, +}; + +#define ILA_CMD_MAX (__ILA_CMD_MAX - 1) + +#define ILA_DIR_IN (1 << 0) +#define ILA_DIR_OUT (1 << 1) + #endif /* _UAPI_LINUX_ILA_H */ diff --git a/net/ipv6/ila/Makefile b/net/ipv6/ila/Makefile index 31d136be2f21..4b32e5921e5c 100644 --- a/net/ipv6/ila/Makefile +++ b/net/ipv6/ila/Makefile @@ -4,4 +4,4 @@ obj-$(CONFIG_IPV6_ILA) += ila.o -ila-objs := ila_common.o ila_lwt.o +ila-objs := ila_common.o ila_lwt.o ila_xlat.o diff --git a/net/ipv6/ila/ila.h b/net/ipv6/ila/ila.h index b94081ff2f8a..28542cb2b387 100644 --- a/net/ipv6/ila/ila.h +++ b/net/ipv6/ila/ila.h @@ -42,5 +42,7 @@ void update_ipv6_locator(struct sk_buff *skb, struct ila_params *p); int ila_lwt_init(void); void ila_lwt_fini(void); +int ila_xlat_init(void); +void ila_xlat_fini(void); #endif /* __ILA_H */ diff --git a/net/ipv6/ila/ila_common.c b/net/ipv6/ila/ila_common.c index 64e1904991ac..32dc9aab7297 100644 --- a/net/ipv6/ila/ila_common.c +++ b/net/ipv6/ila/ila_common.c @@ -80,12 +80,20 @@ static int __init ila_init(void) if (ret) goto fail_lwt; + ret = ila_xlat_init(); + if (ret) + goto fail_xlat; + + return 0; +fail_xlat: + ila_lwt_fini(); fail_lwt: return ret; } static void __exit ila_fini(void) { + ila_xlat_fini(); ila_lwt_fini(); } diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c new file mode 100644 index
000000000000..295ca29a23c3 --- /dev/null +++ b/net/ipv6/ila/ila_xlat.c @@ -0,0 +1,680 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ila.h" + +struct ila_xlat_params { + struct ila_params ip; + __be64 identifier; + int ifindex; + unsigned int dir; +}; + +struct ila_map { + struct ila_xlat_params p; + struct rhash_head node; + struct ila_map __rcu *next; + struct rcu_head rcu; +}; + +static unsigned int ila_net_id; + +struct ila_net { + struct rhashtable rhash_table; + spinlock_t *locks; /* Bucket locks for entry manipulation */ + unsigned int locks_mask; + bool hooks_registered; +}; + +#define LOCKS_PER_CPU 10 + +static int alloc_ila_locks(struct ila_net *ilan) +{ + unsigned int i, size; + unsigned int nr_pcpus = num_possible_cpus(); + + nr_pcpus = min_t(unsigned int, nr_pcpus, 32UL); + size = roundup_pow_of_two(nr_pcpus * LOCKS_PER_CPU); + + if (sizeof(spinlock_t) != 0) { +#ifdef CONFIG_NUMA + if (size * sizeof(spinlock_t) > PAGE_SIZE) + ilan->locks = vmalloc(size * sizeof(spinlock_t)); + else +#endif + ilan->locks = kmalloc_array(size, sizeof(spinlock_t), + GFP_KERNEL); + if (!ilan->locks) + return -ENOMEM; + for (i = 0; i < size; i++) + spin_lock_init(&ilan->locks[i]); + } + ilan->locks_mask = size - 1; + + return 0; +} + +static u32 hashrnd __read_mostly; +static __always_inline void __ila_hash_secret_init(void) +{ + net_get_random_once(&hashrnd, sizeof(hashrnd)); +} + +static inline u32 ila_identifier_hash(__be64 identifier) +{ + u32 *v = (u32 *)&identifier; + + return jhash_2words(v[0], v[1], hashrnd); +} + +static inline spinlock_t *ila_get_lock(struct ila_net *ilan, __be64 identifier) +{ + return &ilan->locks[ila_identifier_hash(identifier) & ilan->locks_mask]; +} + +static inline int ila_cmp_wildcards(struct ila_map *ila, __be64 loc, + int ifindex, unsigned int dir) +{ + return (ila->p.ip.locator_match && ila->p.ip.locator_match != loc) || + (ila->p.ifindex && ila->p.ifindex != ifindex) || + !(ila->p.dir & dir); +} + +static inline int ila_cmp_params(struct ila_map *ila, struct ila_xlat_params *p) +{ + return (ila->p.ip.locator_match != p->ip.locator_match) || + (ila->p.ifindex != p->ifindex) || + (ila->p.dir != p->dir); +} + +static int ila_cmpfn(struct rhashtable_compare_arg *arg, + const void *obj) +{ + const struct ila_map *ila = obj; + + return (ila->p.identifier != *(__be64 *)arg->key); +} + +static inline int ila_order(struct ila_map *ila) +{ + int score = 0; + + if (ila->p.ip.locator_match) + score += 1 << 0; + + if (ila->p.ifindex) + score += 1 << 1; + + return score; +} + +static const struct rhashtable_params rht_params = { + .nelem_hint = 1024, + .head_offset = offsetof(struct ila_map, node), + .key_offset = offsetof(struct ila_map, p.identifier), + .key_len = sizeof(u64), /* identifier */ + .max_size = 1048576, + .min_size = 256, + .automatic_shrinking = true, + .obj_cmpfn = ila_cmpfn, +}; + +static struct genl_family ila_nl_family = { + .id = GENL_ID_GENERATE, + .hdrsize = 0, + .name = ILA_GENL_NAME, + .version = ILA_GENL_VERSION, + .maxattr = ILA_ATTR_MAX, + .netnsok = true, + .parallel_ops = true, +}; + +static struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = { + [ILA_ATTR_IDENTIFIER] = { .type = NLA_U64, }, + [ILA_ATTR_LOCATOR] = { .type = NLA_U64, }, + [ILA_ATTR_LOCATOR_MATCH] = { .type = NLA_U64, }, + [ILA_ATTR_IFINDEX] = { .type = NLA_U32, }, + [ILA_ATTR_DIR] = { .type = NLA_U32, }, +}; + +static int parse_nl_config(struct genl_info *info, + struct ila_xlat_params *p) +{ + memset(p, 0, sizeof(*p)); + + 
if (info->attrs[ILA_ATTR_IDENTIFIER]) + p->identifier = (__force __be64)nla_get_u64( + info->attrs[ILA_ATTR_IDENTIFIER]); + + if (info->attrs[ILA_ATTR_LOCATOR]) + p->ip.locator = (__force __be64)nla_get_u64( + info->attrs[ILA_ATTR_LOCATOR]); + + if (info->attrs[ILA_ATTR_LOCATOR_MATCH]) + p->ip.locator_match = (__force __be64)nla_get_u64( + info->attrs[ILA_ATTR_LOCATOR_MATCH]); + + if (info->attrs[ILA_ATTR_IFINDEX]) + p->ifindex = nla_get_s32(info->attrs[ILA_ATTR_IFINDEX]); + + if (info->attrs[ILA_ATTR_DIR]) + p->dir = nla_get_u32(info->attrs[ILA_ATTR_DIR]); + + return 0; +} + +/* Must be called with rcu readlock */ +static inline struct ila_map *ila_lookup_wildcards(__be64 id, __be64 loc, + int ifindex, + unsigned int dir, + struct ila_net *ilan) +{ + struct ila_map *ila; + + ila = rhashtable_lookup_fast(&ilan->rhash_table, &id, rht_params); + while (ila) { + if (!ila_cmp_wildcards(ila, loc, ifindex, dir)) + return ila; + ila = rcu_access_pointer(ila->next); + } + + return NULL; +} + +/* Must be called with rcu readlock */ +static inline struct ila_map *ila_lookup_by_params(struct ila_xlat_params *p, + struct ila_net *ilan) +{ + struct ila_map *ila; + + ila = rhashtable_lookup_fast(&ilan->rhash_table, &p->identifier, + rht_params); + while (ila) { + if (!ila_cmp_params(ila, p)) + return ila; + ila = rcu_access_pointer(ila->next); + } + + return NULL; +} + +static inline void ila_release(struct ila_map *ila) +{ + kfree_rcu(ila, rcu); +} + +static void ila_free_cb(void *ptr, void *arg) +{ + struct ila_map *ila = (struct ila_map *)ptr, *next; + + /* Assume rcu_readlock held */ + while (ila) { + next = rcu_access_pointer(ila->next); + ila_release(ila); + ila = next; + } +} + +static int ila_xlat_addr(struct sk_buff *skb, int dir); + +static unsigned int +ila_nf_input(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + ila_xlat_addr(skb, ILA_DIR_IN); + return NF_ACCEPT; +} + +static struct nf_hook_ops ila_nf_hook_ops[] __read_mostly = { + { + .hook = ila_nf_input, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_PRE_ROUTING, + .priority = -1, + }, +}; + +static int ila_add_mapping(struct net *net, struct ila_xlat_params *p) +{ + struct ila_net *ilan = net_generic(net, ila_net_id); + struct ila_map *ila, *head; + spinlock_t *lock = ila_get_lock(ilan, p->identifier); + int err = 0, order; + + if (!ilan->hooks_registered) { + /* We defer registering net hooks in the namespace until the + * first mapping is added. + */ + err = nf_register_net_hooks(net, ila_nf_hook_ops, + ARRAY_SIZE(ila_nf_hook_ops)); + if (err) + return err; + + ilan->hooks_registered = true; + } + + ila = kzalloc(sizeof(*ila), GFP_KERNEL); + if (!ila) + return -ENOMEM; + + ila->p = *p; + + if (p->ip.locator_match) { + /* Precompute checksum difference for translation since we + * know both the old identifier and the new one. 
+ */ + ila->p.ip.csum_diff = compute_csum_diff8( + (__be32 *)&p->ip.locator_match, + (__be32 *)&p->ip.locator); + } + + order = ila_order(ila); + + spin_lock(lock); + + head = rhashtable_lookup_fast(&ilan->rhash_table, &p->identifier, + rht_params); + if (!head) { + /* New entry for the rhash_table */ + err = rhashtable_lookup_insert_fast(&ilan->rhash_table, + &ila->node, rht_params); + } else { + struct ila_map *tila = head, *prev = NULL; + + do { + if (!ila_cmp_params(tila, p)) { + err = -EEXIST; + goto out; + } + + if (order > ila_order(tila)) + break; + + prev = tila; + tila = rcu_dereference_protected(tila->next, + lockdep_is_held(lock)); + } while (tila); + + if (prev) { + /* Insert in sub list of head */ + RCU_INIT_POINTER(ila->next, tila); + rcu_assign_pointer(prev->next, ila); + } else { + /* Make this ila new head */ + RCU_INIT_POINTER(ila->next, head); + err = rhashtable_replace_fast(&ilan->rhash_table, + &head->node, + &ila->node, rht_params); + if (err) + goto out; + } + } + +out: + spin_unlock(lock); + + if (err) + kfree(ila); + + return err; +} + +static int ila_del_mapping(struct net *net, struct ila_xlat_params *p) +{ + struct ila_net *ilan = net_generic(net, ila_net_id); + struct ila_map *ila, *head, *prev; + spinlock_t *lock = ila_get_lock(ilan, p->identifier); + int err = -ENOENT; + + spin_lock(lock); + + head = rhashtable_lookup_fast(&ilan->rhash_table, + &p->identifier, rht_params); + ila = head; + + prev = NULL; + + while (ila) { + if (ila_cmp_params(ila, p)) { + prev = ila; + ila = rcu_dereference_protected(ila->next, + lockdep_is_held(lock)); + continue; + } + + err = 0; + + if (prev) { + /* Not head, just delete from list */ + rcu_assign_pointer(prev->next, ila->next); + } else { + /* It is the head. If there is something in the + * sublist we need to make a new head. 
+ */ + head = rcu_dereference_protected(ila->next, + lockdep_is_held(lock)); + if (head) { + /* Put first entry in the sublist into the + * table + */ + err = rhashtable_replace_fast( + &ilan->rhash_table, &ila->node, + &head->node, rht_params); + if (err) + goto out; + } else { + /* Entry no longer used */ + err = rhashtable_remove_fast(&ilan->rhash_table, + &ila->node, + rht_params); + } + } + + ila_release(ila); + + break; + } + +out: + spin_unlock(lock); + + return err; +} + +static int ila_nl_cmd_add_mapping(struct sk_buff *skb, struct genl_info *info) +{ + struct net *net = genl_info_net(info); + struct ila_xlat_params p; + int err; + + err = parse_nl_config(info, &p); + if (err) + return err; + + return ila_add_mapping(net, &p); +} + +static int ila_nl_cmd_del_mapping(struct sk_buff *skb, struct genl_info *info) +{ + struct net *net = genl_info_net(info); + struct ila_xlat_params p; + int err; + + err = parse_nl_config(info, &p); + if (err) + return err; + + ila_del_mapping(net, &p); + + return 0; +} + +static int ila_fill_info(struct ila_map *ila, struct sk_buff *msg) +{ + if (nla_put_u64(msg, ILA_ATTR_IDENTIFIER, + (__force u64)ila->p.identifier) || + nla_put_u64(msg, ILA_ATTR_LOCATOR, + (__force u64)ila->p.ip.locator) || + nla_put_u64(msg, ILA_ATTR_LOCATOR_MATCH, + (__force u64)ila->p.ip.locator_match) || + nla_put_s32(msg, ILA_ATTR_IFINDEX, ila->p.ifindex) || + nla_put_u32(msg, ILA_ATTR_DIR, ila->p.dir)) + return -1; + + return 0; +} + +static int ila_dump_info(struct ila_map *ila, + u32 portid, u32 seq, u32 flags, + struct sk_buff *skb, u8 cmd) +{ + void *hdr; + + hdr = genlmsg_put(skb, portid, seq, &ila_nl_family, flags, cmd); + if (!hdr) + return -ENOMEM; + + if (ila_fill_info(ila, skb) < 0) + goto nla_put_failure; + + genlmsg_end(skb, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(skb, hdr); + return -EMSGSIZE; +} + +static int ila_nl_cmd_get_mapping(struct sk_buff *skb, struct genl_info *info) +{ + struct net *net = genl_info_net(info); + struct ila_net *ilan = net_generic(net, ila_net_id); + struct sk_buff *msg; + struct ila_xlat_params p; + struct ila_map *ila; + int ret; + + ret = parse_nl_config(info, &p); + if (ret) + return ret; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + rcu_read_lock(); + + ila = ila_lookup_by_params(&p, ilan); + if (ila) { + ret = ila_dump_info(ila, + info->snd_portid, + info->snd_seq, 0, msg, + info->genlhdr->cmd); + } + + rcu_read_unlock(); + + if (ret < 0) + goto out_free; + + return genlmsg_reply(msg, info); + +out_free: + nlmsg_free(msg); + return ret; +} + +struct ila_dump_iter { + struct rhashtable_iter rhiter; +}; + +static int ila_nl_dump_start(struct netlink_callback *cb) +{ + struct net *net = sock_net(cb->skb->sk); + struct ila_net *ilan = net_generic(net, ila_net_id); + struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args; + + return rhashtable_walk_init(&ilan->rhash_table, &iter->rhiter); +} + +static int ila_nl_dump_done(struct netlink_callback *cb) +{ + struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args; + + rhashtable_walk_exit(&iter->rhiter); + + return 0; +} + +static int ila_nl_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args; + struct rhashtable_iter *rhiter = &iter->rhiter; + struct ila_map *ila; + int ret; + + ret = rhashtable_walk_start(rhiter); + if (ret && ret != -EAGAIN) + goto done; + + for (;;) { + ila = rhashtable_walk_next(rhiter); + + if (IS_ERR(ila)) { + if (PTR_ERR(ila) == 
-EAGAIN) + continue; + ret = PTR_ERR(ila); + goto done; + } else if (!ila) { + break; + } + + while (ila) { + ret = ila_dump_info(ila, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + skb, ILA_CMD_GET); + if (ret) + goto done; + + ila = rcu_access_pointer(ila->next); + } + } + + ret = skb->len; + +done: + rhashtable_walk_stop(rhiter); + return ret; +} + +static const struct genl_ops ila_nl_ops[] = { + { + .cmd = ILA_CMD_ADD, + .doit = ila_nl_cmd_add_mapping, + .policy = ila_nl_policy, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = ILA_CMD_DEL, + .doit = ila_nl_cmd_del_mapping, + .policy = ila_nl_policy, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = ILA_CMD_GET, + .doit = ila_nl_cmd_get_mapping, + .start = ila_nl_dump_start, + .dumpit = ila_nl_dump, + .done = ila_nl_dump_done, + .policy = ila_nl_policy, + }, +}; + +#define ILA_HASH_TABLE_SIZE 1024 + +static __net_init int ila_init_net(struct net *net) +{ + int err; + struct ila_net *ilan = net_generic(net, ila_net_id); + + err = alloc_ila_locks(ilan); + if (err) + return err; + + rhashtable_init(&ilan->rhash_table, &rht_params); + + return 0; +} + +static __net_exit void ila_exit_net(struct net *net) +{ + struct ila_net *ilan = net_generic(net, ila_net_id); + + rhashtable_free_and_destroy(&ilan->rhash_table, ila_free_cb, NULL); + + kvfree(ilan->locks); + + if (ilan->hooks_registered) + nf_unregister_net_hooks(net, ila_nf_hook_ops, + ARRAY_SIZE(ila_nf_hook_ops)); +} + +static struct pernet_operations ila_net_ops = { + .init = ila_init_net, + .exit = ila_exit_net, + .id = &ila_net_id, + .size = sizeof(struct ila_net), +}; + +static int ila_xlat_addr(struct sk_buff *skb, int dir) +{ + struct ila_map *ila; + struct ipv6hdr *ip6h = ipv6_hdr(skb); + struct net *net = dev_net(skb->dev); + struct ila_net *ilan = net_generic(net, ila_net_id); + __be64 identifier, locator_match; + size_t nhoff; + + /* Assumes skb contains a valid IPv6 header that is pulled */ + + identifier = *(__be64 *)&ip6h->daddr.in6_u.u6_addr8[8]; + locator_match = *(__be64 *)&ip6h->daddr.in6_u.u6_addr8[0]; + nhoff = sizeof(struct ipv6hdr); + + rcu_read_lock(); + + ila = ila_lookup_wildcards(identifier, locator_match, + skb->dev->ifindex, dir, ilan); + if (ila) + update_ipv6_locator(skb, &ila->p.ip); + + rcu_read_unlock(); + + return 0; +} + +int ila_xlat_incoming(struct sk_buff *skb) +{ + return ila_xlat_addr(skb, ILA_DIR_IN); +} +EXPORT_SYMBOL(ila_xlat_incoming); + +int ila_xlat_outgoing(struct sk_buff *skb) +{ + return ila_xlat_addr(skb, ILA_DIR_OUT); +} +EXPORT_SYMBOL(ila_xlat_outgoing); + +int ila_xlat_init(void) +{ + int ret; + + ret = register_pernet_device(&ila_net_ops); + if (ret) + goto exit; + + ret = genl_register_family_with_ops(&ila_nl_family, + ila_nl_ops); + if (ret < 0) + goto unregister; + + return 0; + +unregister: + unregister_pernet_device(&ila_net_ops); +exit: + return ret; +} + +void ila_xlat_fini(void) +{ + genl_unregister_family(&ila_nl_family); + unregister_pernet_device(&ila_net_ops); +} -- cgit v1.2.3 From 64be0aed59ad519d6f2160868734f7e278290ac1 Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Wed, 16 Dec 2015 12:30:03 +0900 Subject: net: diag: Add the ability to destroy a socket. This patch adds a SOCK_DESTROY operation, a destroy function pointer to sock_diag_handler, and a diag_destroy function pointer. It does not include any implementation code. Signed-off-by: Lorenzo Colitti Acked-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/linux/sock_diag.h | 2 ++ include/net/sock.h | 1 + include/uapi/linux/sock_diag.h | 1 + net/core/sock_diag.c | 23 ++++++++++++++++++++--- 4 files changed, 24 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/sock_diag.h b/include/linux/sock_diag.h index fddebc617469..4018b48f2b3b 100644 --- a/include/linux/sock_diag.h +++ b/include/linux/sock_diag.h @@ -15,6 +15,7 @@ struct sock_diag_handler { __u8 family; int (*dump)(struct sk_buff *skb, struct nlmsghdr *nlh); int (*get_info)(struct sk_buff *skb, struct sock *sk); + int (*destroy)(struct sk_buff *skb, struct nlmsghdr *nlh); }; int sock_diag_register(const struct sock_diag_handler *h); @@ -68,4 +69,5 @@ bool sock_diag_has_destroy_listeners(const struct sock *sk) } void sock_diag_broadcast_destroy(struct sock *sk); +int sock_diag_destroy(struct sock *sk, int err); #endif diff --git a/include/net/sock.h b/include/net/sock.h index ab0269f4b2cc..6e6e8a25d997 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1060,6 +1060,7 @@ struct proto { void (*destroy_cgroup)(struct mem_cgroup *memcg); struct cg_proto *(*proto_cgroup)(struct mem_cgroup *memcg); #endif + int (*diag_destroy)(struct sock *sk, int err); }; int proto_register(struct proto *prot, int alloc_slab); diff --git a/include/uapi/linux/sock_diag.h b/include/uapi/linux/sock_diag.h index 49230d36f9ce..bae2d80034d4 100644 --- a/include/uapi/linux/sock_diag.h +++ b/include/uapi/linux/sock_diag.h @@ -4,6 +4,7 @@ #include #define SOCK_DIAG_BY_FAMILY 20 +#define SOCK_DESTROY 21 struct sock_diag_req { __u8 sdiag_family; diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index 0c1d58d43f67..a996ce8c8fb2 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -214,7 +214,7 @@ void sock_diag_unregister(const struct sock_diag_handler *hnld) } EXPORT_SYMBOL_GPL(sock_diag_unregister); -static int __sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) +static int __sock_diag_cmd(struct sk_buff *skb, struct nlmsghdr *nlh) { int err; struct sock_diag_req *req = nlmsg_data(nlh); @@ -234,8 +234,12 @@ static int __sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) hndl = sock_diag_handlers[req->sdiag_family]; if (hndl == NULL) err = -ENOENT; - else + else if (nlh->nlmsg_type == SOCK_DIAG_BY_FAMILY) err = hndl->dump(skb, nlh); + else if (nlh->nlmsg_type == SOCK_DESTROY && hndl->destroy) + err = hndl->destroy(skb, nlh); + else + err = -EOPNOTSUPP; mutex_unlock(&sock_diag_table_mutex); return err; @@ -261,7 +265,8 @@ static int sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return ret; case SOCK_DIAG_BY_FAMILY: - return __sock_diag_rcv_msg(skb, nlh); + case SOCK_DESTROY: + return __sock_diag_cmd(skb, nlh); default: return -EINVAL; } @@ -295,6 +300,18 @@ static int sock_diag_bind(struct net *net, int group) return 0; } +int sock_diag_destroy(struct sock *sk, int err) +{ + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) + return -EPERM; + + if (!sk->sk_prot->diag_destroy) + return -EOPNOTSUPP; + + return sk->sk_prot->diag_destroy(sk, err); +} +EXPORT_SYMBOL_GPL(sock_diag_destroy); + static int __net_init diag_net_init(struct net *net) { struct netlink_kernel_cfg cfg = { -- cgit v1.2.3 From 32bc201e1974976b7d3fea9a9b17bb7392ca6394 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 16 Dec 2015 17:50:11 +0800 Subject: ipv6: allow routes to be configured with expire values Add the support for adding expire value to routes, requested by Tom Gundersen for systemd-networkd, and NetworkManager wants 
it too. Implement it by adding the new RTNETLINK attribute RTA_EXPIRES. Signed-off-by: Xin Long Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/uapi/linux/rtnetlink.h | 1 + net/ipv6/route.c | 10 ++++++++++ 2 files changed, 11 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 123a5af4e8bb..ca764b5da86d 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -311,6 +311,7 @@ enum rtattr_type_t { RTA_PREF, RTA_ENCAP_TYPE, RTA_ENCAP, + RTA_EXPIRES, __RTA_MAX }; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index c83b6a5b3604..3c8834bc822d 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2709,6 +2709,7 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { [RTA_PREF] = { .type = NLA_U8 }, [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, [RTA_ENCAP] = { .type = NLA_NESTED }, + [RTA_EXPIRES] = { .type = NLA_U32 }, }; static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -2809,6 +2810,15 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, if (tb[RTA_ENCAP_TYPE]) cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]); + if (tb[RTA_EXPIRES]) { + unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ); + + if (addrconf_finite_timeout(timeout)) { + cfg->fc_expires = jiffies_to_clock_t(timeout * HZ); + cfg->fc_flags |= RTF_EXPIRES; + } + } + err = 0; errout: return err; -- cgit v1.2.3 From 715f504b118998c41a2079a17e16bf5a8a114885 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Wed, 16 Dec 2015 17:22:47 +0100 Subject: ipv6: add IPV6_HDRINCL option for raw sockets Unlike Windows, we miss IPV6_HDRINCL for SOL_IPV6 and SOL_RAW. The SOL_IP/IP_HDRINCL is not available for IPv6 sockets. Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S.
Miller --- include/uapi/linux/in6.h | 1 + net/ipv6/raw.c | 20 ++++++++++++++++---- 2 files changed, 17 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/in6.h b/include/uapi/linux/in6.h index 79b12b004ade..318a4828bf98 100644 --- a/include/uapi/linux/in6.h +++ b/include/uapi/linux/in6.h @@ -196,6 +196,7 @@ struct in6_flowlabel_req { #define IPV6_IPSEC_POLICY 34 #define IPV6_XFRM_POLICY 35 +#define IPV6_HDRINCL 36 #endif /* diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 99140986e887..fa59dd7a427e 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -972,6 +972,11 @@ static int do_rawv6_setsockopt(struct sock *sk, int level, int optname, return -EFAULT; switch (optname) { + case IPV6_HDRINCL: + if (sk->sk_type != SOCK_RAW) + return -EINVAL; + inet_sk(sk)->hdrincl = !!val; + return 0; case IPV6_CHECKSUM: if (inet_sk(sk)->inet_num == IPPROTO_ICMPV6 && level == IPPROTO_IPV6) { @@ -1016,7 +1021,8 @@ static int rawv6_setsockopt(struct sock *sk, int level, int optname, return -EOPNOTSUPP; return rawv6_seticmpfilter(sk, level, optname, optval, optlen); case SOL_IPV6: - if (optname == IPV6_CHECKSUM) + if (optname == IPV6_CHECKSUM || + optname == IPV6_HDRINCL) break; default: return ipv6_setsockopt(sk, level, optname, optval, optlen); @@ -1037,7 +1043,8 @@ static int compat_rawv6_setsockopt(struct sock *sk, int level, int optname, return -EOPNOTSUPP; return rawv6_seticmpfilter(sk, level, optname, optval, optlen); case SOL_IPV6: - if (optname == IPV6_CHECKSUM) + if (optname == IPV6_CHECKSUM || + optname == IPV6_HDRINCL) break; default: return compat_ipv6_setsockopt(sk, level, optname, @@ -1057,6 +1064,9 @@ static int do_rawv6_getsockopt(struct sock *sk, int level, int optname, return -EFAULT; switch (optname) { + case IPV6_HDRINCL: + val = inet_sk(sk)->hdrincl; + break; case IPV6_CHECKSUM: /* * We allow getsockopt() for IPPROTO_IPV6-level @@ -1094,7 +1104,8 @@ static int rawv6_getsockopt(struct sock *sk, int level, int optname, return -EOPNOTSUPP; return rawv6_geticmpfilter(sk, level, optname, optval, optlen); case SOL_IPV6: - if (optname == IPV6_CHECKSUM) + if (optname == IPV6_CHECKSUM || + optname == IPV6_HDRINCL) break; default: return ipv6_getsockopt(sk, level, optname, optval, optlen); @@ -1115,7 +1126,8 @@ static int compat_rawv6_getsockopt(struct sock *sk, int level, int optname, return -EOPNOTSUPP; return rawv6_geticmpfilter(sk, level, optname, optval, optlen); case SOL_IPV6: - if (optname == IPV6_CHECKSUM) + if (optname == IPV6_CHECKSUM || + optname == IPV6_HDRINCL) break; default: return compat_ipv6_getsockopt(sk, level, optname, -- cgit v1.2.3 From cc9da6cc4f56e05cc9e591459fe0192727ff58b3 Mon Sep 17 00:00:00 2001 From: Bjørn Mork Date: Wed, 16 Dec 2015 16:44:38 +0100 Subject: ipv6: addrconf: use stable address generator for ARPHRD_NONE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a new address generator mode, using the stable address generator with an automatically generated secret. This is intended as a default address generator mode for device types with no EUI64 implementation. The new generator is used for ARPHRD_NONE interfaces initially, adding default IPv6 autoconf support to e.g. tun interfaces. If the addrgenmode is set to 'random', either by default or manually, and no stable secret is available, then a random secret is used as input for the stable-privacy address generator. The secret can be read and modified like manually configured secrets, using the proc interface. 
Modifying the secret will change the addrgen mode to 'stable-privacy' to indicate that it operates on a known secret. Existing behaviour of the 'stable-privacy' mode is kept unchanged. If a known secret is available when the device is created, then the mode will default to 'stable-privacy' as before. The mode can be manually set to 'random' but it will behave exactly like 'stable-privacy' in this case. The secret will not change. Cc: Hannes Frederic Sowa Cc: 吉藤英明 Signed-off-by: Bjørn Mork Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 1 + net/ipv6/addrconf.c | 45 ++++++++++++++++++++++++++++++++++++++------ 2 files changed, 40 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 2be1dd5a103f..a30b78090594 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -218,6 +218,7 @@ enum in6_addr_gen_mode { IN6_ADDR_GEN_MODE_EUI64, IN6_ADDR_GEN_MODE_NONE, IN6_ADDR_GEN_MODE_STABLE_PRIVACY, + IN6_ADDR_GEN_MODE_RANDOM, }; /* Bridge section */ diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 233efa67dc3d..819b7777f3cb 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2319,6 +2319,12 @@ static void manage_tempaddrs(struct inet6_dev *idev, } } +static bool is_addr_mode_generate_stable(struct inet6_dev *idev) +{ + return idev->addr_gen_mode == IN6_ADDR_GEN_MODE_STABLE_PRIVACY || + idev->addr_gen_mode == IN6_ADDR_GEN_MODE_RANDOM; +} + void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) { struct prefix_info *pinfo; @@ -2432,8 +2438,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) in6_dev->token.s6_addr + 8, 8); read_unlock_bh(&in6_dev->lock); tokenized = true; - } else if (in6_dev->addr_gen_mode == - IN6_ADDR_GEN_MODE_STABLE_PRIVACY && + } else if (is_addr_mode_generate_stable(in6_dev) && !ipv6_generate_stable_address(&addr, 0, in6_dev)) { addr_flags |= IFA_F_STABLE_PRIVACY; @@ -3033,6 +3038,17 @@ retry: return 0; } +static void ipv6_gen_mode_random_init(struct inet6_dev *idev) +{ + struct ipv6_stable_secret *s = &idev->cnf.stable_secret; + + if (s->initialized) + return; + s = &idev->cnf.stable_secret; + get_random_bytes(&s->secret, sizeof(s->secret)); + s->initialized = true; +} + static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route) { struct in6_addr addr; @@ -3043,13 +3059,18 @@ static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route) ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0); - if (idev->addr_gen_mode == IN6_ADDR_GEN_MODE_STABLE_PRIVACY) { + switch (idev->addr_gen_mode) { + case IN6_ADDR_GEN_MODE_RANDOM: + ipv6_gen_mode_random_init(idev); + /* fallthrough */ + case IN6_ADDR_GEN_MODE_STABLE_PRIVACY: if (!ipv6_generate_stable_address(&addr, 0, idev)) addrconf_add_linklocal(idev, &addr, IFA_F_STABLE_PRIVACY); else if (prefix_route) addrconf_prefix_route(&addr, 64, idev->dev, 0, 0); - } else if (idev->addr_gen_mode == IN6_ADDR_GEN_MODE_EUI64) { + break; + case IN6_ADDR_GEN_MODE_EUI64: /* addrconf_add_linklocal also adds a prefix_route and we * only need to care about prefix routes if ipv6_generate_eui64 * couldn't generate one. 
@@ -3058,6 +3079,11 @@ static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route) addrconf_add_linklocal(idev, &addr, 0); else if (prefix_route) addrconf_prefix_route(&addr, 64, idev->dev, 0, 0); + break; + case IN6_ADDR_GEN_MODE_NONE: + default: + /* will not add any link local address */ + break; } } @@ -3073,7 +3099,8 @@ static void addrconf_dev_config(struct net_device *dev) (dev->type != ARPHRD_INFINIBAND) && (dev->type != ARPHRD_IEEE1394) && (dev->type != ARPHRD_TUNNEL6) && - (dev->type != ARPHRD_6LOWPAN)) { + (dev->type != ARPHRD_6LOWPAN) && + (dev->type != ARPHRD_NONE)) { /* Alas, we support only Ethernet autoconfiguration. */ return; } @@ -3082,6 +3109,11 @@ static void addrconf_dev_config(struct net_device *dev) if (IS_ERR(idev)) return; + /* this device type has no EUI support */ + if (dev->type == ARPHRD_NONE && + idev->addr_gen_mode == IN6_ADDR_GEN_MODE_EUI64) + idev->addr_gen_mode = IN6_ADDR_GEN_MODE_RANDOM; + addrconf_addr_gen(idev, false); } @@ -4926,7 +4958,8 @@ static int inet6_set_link_af(struct net_device *dev, const struct nlattr *nla) if (mode != IN6_ADDR_GEN_MODE_EUI64 && mode != IN6_ADDR_GEN_MODE_NONE && - mode != IN6_ADDR_GEN_MODE_STABLE_PRIVACY) + mode != IN6_ADDR_GEN_MODE_STABLE_PRIVACY && + mode != IN6_ADDR_GEN_MODE_RANDOM) return -EINVAL; if (mode == IN6_ADDR_GEN_MODE_STABLE_PRIVACY && -- cgit v1.2.3 From 05c74e5e53f6cb07502c3e6a820f33e2777b6605 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 17 Dec 2015 23:51:53 +0100 Subject: bpf: add bpf_skb_load_bytes helper When hacking tc programs with eBPF, one of the issues that come up from time to time is to load addresses from headers. In eBPF as in classic BPF, we have BPF_LD | BPF_ABS | BPF_{B,H,W} instructions that extract a byte, half-word or word out of the skb data through helpers such as bpf_load_pointer() (interpreter case). F.e. extracting a whole IPv6 address could possibly look like ... union v6addr { struct { __u32 p1; __u32 p2; __u32 p3; __u32 p4; }; __u8 addr[16]; }; [...] a.p1 = htonl(load_word(skb, off)); a.p2 = htonl(load_word(skb, off + 4)); a.p3 = htonl(load_word(skb, off + 8)); a.p4 = htonl(load_word(skb, off + 12)); [...] /* access to a.addr[...] */ This work adds a complementary helper bpf_skb_load_bytes() (we also have bpf_skb_store_bytes()) as an alternative where the same call would look like from an eBPF program: ret = bpf_skb_load_bytes(skb, off, addr, sizeof(addr)); Same verifier restrictions apply as in the ffeedafbf023 ("bpf: introduce current->pid, tgid, uid, gid, comm accessors") case, where stack memory access needs to be statically verified and thus guaranteed to be initialized on first use (otherwise the verifier cannot tell whether a subsequent access to it is valid or not, as it's runtime dependent). Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S.
Miller --- include/uapi/linux/bpf.h | 1 + net/core/filter.c | 35 ++++++++++++++++++++++++++++++++++- 2 files changed, 35 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 9ea2d22fa2cb..8bed7f1176b8 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -269,6 +269,7 @@ enum bpf_func_id { * Return: 0 on success */ BPF_FUNC_perf_event_output, + BPF_FUNC_skb_load_bytes, __BPF_FUNC_MAX_ID, }; diff --git a/net/core/filter.c b/net/core/filter.c index 672eefbfbe99..34bf6fc77c1d 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1245,6 +1245,7 @@ int sk_attach_bpf(u32 ufd, struct sock *sk) } #define BPF_RECOMPUTE_CSUM(flags) ((flags) & 1) +#define BPF_LDST_LEN 16U static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags) { @@ -1252,7 +1253,7 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags) int offset = (int) r2; void *from = (void *) (long) r3; unsigned int len = (unsigned int) r4; - char buf[16]; + char buf[BPF_LDST_LEN]; void *ptr; /* bpf verifier guarantees that: @@ -1299,6 +1300,36 @@ const struct bpf_func_proto bpf_skb_store_bytes_proto = { .arg5_type = ARG_ANYTHING, }; +static u64 bpf_skb_load_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + const struct sk_buff *skb = (const struct sk_buff *)(unsigned long) r1; + int offset = (int) r2; + void *to = (void *)(unsigned long) r3; + unsigned int len = (unsigned int) r4; + void *ptr; + + if (unlikely((u32) offset > 0xffff || len > BPF_LDST_LEN)) + return -EFAULT; + + ptr = skb_header_pointer(skb, offset, len, to); + if (unlikely(!ptr)) + return -EFAULT; + if (ptr != to) + memcpy(to, ptr, len); + + return 0; +} + +const struct bpf_func_proto bpf_skb_load_bytes_proto = { + .func = bpf_skb_load_bytes, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_STACK, + .arg4_type = ARG_CONST_STACK_SIZE, +}; + #define BPF_HEADER_FIELD_SIZE(flags) ((flags) & 0x0f) #define BPF_IS_PSEUDO_HEADER(flags) ((flags) & 0x10) @@ -1654,6 +1685,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) switch (func_id) { case BPF_FUNC_skb_store_bytes: return &bpf_skb_store_bytes_proto; + case BPF_FUNC_skb_load_bytes: + return &bpf_skb_load_bytes_proto; case BPF_FUNC_l3_csum_replace: return &bpf_l3_csum_replace_proto; case BPF_FUNC_l4_csum_replace: -- cgit v1.2.3 From f3a4094558ddf8afa8bb58250d548e15e059c65a Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Wed, 30 Dec 2015 16:28:25 +0100 Subject: ethtool: Add phy statistics Ethernet PHYs can maintain statistics, for example errors while idle and receive errors. Add an ethtool mechanism to retrieve these statistics, using the same model as MAC statistics. Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. 
Miller --- include/linux/phy.h | 6 ++++ include/uapi/linux/ethtool.h | 3 ++ net/core/ethtool.c | 81 +++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 89 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 05fde31b6dc6..a89cb0eef911 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -589,6 +589,12 @@ struct phy_driver { int (*module_eeprom)(struct phy_device *dev, struct ethtool_eeprom *ee, u8 *data); + /* Get statistics from the phy using ethtool */ + int (*get_sset_count)(struct phy_device *dev); + void (*get_strings)(struct phy_device *dev, u8 *data); + void (*get_stats)(struct phy_device *dev, + struct ethtool_stats *stats, u64 *data); + struct device_driver driver; }; #define to_phy_driver(d) container_of(d, struct phy_driver, driver) diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index cd1629170103..57fa39005e79 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -542,6 +542,7 @@ struct ethtool_pauseparam { * now deprecated * @ETH_SS_FEATURES: Device feature names * @ETH_SS_RSS_HASH_FUNCS: RSS hush function names + * @ETH_SS_PHY_STATS: Statistic names, for use with %ETHTOOL_GPHYSTATS */ enum ethtool_stringset { ETH_SS_TEST = 0, @@ -551,6 +552,7 @@ enum ethtool_stringset { ETH_SS_FEATURES, ETH_SS_RSS_HASH_FUNCS, ETH_SS_TUNABLES, + ETH_SS_PHY_STATS, }; /** @@ -1225,6 +1227,7 @@ enum ethtool_sfeatures_retval_bits { #define ETHTOOL_SRSSH 0x00000047 /* Set RX flow hash configuration */ #define ETHTOOL_GTUNABLE 0x00000048 /* Get tunable configuration */ #define ETHTOOL_STUNABLE 0x00000049 /* Set tunable configuration */ +#define ETHTOOL_GPHYSTATS 0x0000004a /* get PHY-specific statistics */ /* compatibility with older code */ #define SPARC_ETH_GSET ETHTOOL_GSET diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 09948a726347..daf04709dd3c 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -191,6 +191,23 @@ static int ethtool_set_features(struct net_device *dev, void __user *useraddr) return ret; } +static int phy_get_sset_count(struct phy_device *phydev) +{ + int ret; + + if (phydev->drv->get_sset_count && + phydev->drv->get_strings && + phydev->drv->get_stats) { + mutex_lock(&phydev->lock); + ret = phydev->drv->get_sset_count(phydev); + mutex_unlock(&phydev->lock); + + return ret; + } + + return -EOPNOTSUPP; +} + static int __ethtool_get_sset_count(struct net_device *dev, int sset) { const struct ethtool_ops *ops = dev->ethtool_ops; @@ -204,6 +221,13 @@ static int __ethtool_get_sset_count(struct net_device *dev, int sset) if (sset == ETH_SS_TUNABLES) return ARRAY_SIZE(tunable_strings); + if (sset == ETH_SS_PHY_STATS) { + if (dev->phydev) + return phy_get_sset_count(dev->phydev); + else + return -EOPNOTSUPP; + } + if (ops->get_sset_count && ops->get_strings) return ops->get_sset_count(dev, sset); else @@ -223,7 +247,17 @@ static void __ethtool_get_strings(struct net_device *dev, sizeof(rss_hash_func_strings)); else if (stringset == ETH_SS_TUNABLES) memcpy(data, tunable_strings, sizeof(tunable_strings)); - else + else if (stringset == ETH_SS_PHY_STATS) { + struct phy_device *phydev = dev->phydev; + + if (phydev) { + mutex_lock(&phydev->lock); + phydev->drv->get_strings(phydev, data); + mutex_unlock(&phydev->lock); + } else { + return; + } + } else /* ops->get_strings is valid because checked earlier */ ops->get_strings(dev, stringset, data); } @@ -1401,6 +1435,47 @@ static int ethtool_get_stats(struct net_device *dev, void __user 
*useraddr) return ret; } +static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_stats stats; + struct phy_device *phydev = dev->phydev; + u64 *data; + int ret, n_stats; + + if (!phydev) + return -EOPNOTSUPP; + + n_stats = phy_get_sset_count(phydev); + + if (n_stats < 0) + return n_stats; + WARN_ON(n_stats == 0); + + if (copy_from_user(&stats, useraddr, sizeof(stats))) + return -EFAULT; + + stats.n_stats = n_stats; + data = kmalloc_array(n_stats, sizeof(u64), GFP_USER); + if (!data) + return -ENOMEM; + + mutex_lock(&phydev->lock); + phydev->drv->get_stats(phydev, &stats, data); + mutex_unlock(&phydev->lock); + + ret = -EFAULT; + if (copy_to_user(useraddr, &stats, sizeof(stats))) + goto out; + useraddr += sizeof(stats); + if (copy_to_user(useraddr, data, stats.n_stats * sizeof(u64))) + goto out; + ret = 0; + + out: + kfree(data); + return ret; +} + static int ethtool_get_perm_addr(struct net_device *dev, void __user *useraddr) { struct ethtool_perm_addr epaddr; @@ -1779,6 +1854,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_GSSET_INFO: case ETHTOOL_GSTRINGS: case ETHTOOL_GSTATS: + case ETHTOOL_GPHYSTATS: case ETHTOOL_GTSO: case ETHTOOL_GPERMADDR: case ETHTOOL_GUFO: @@ -1991,6 +2067,9 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_STUNABLE: rc = ethtool_set_tunable(dev, useraddr); break; + case ETHTOOL_GPHYSTATS: + rc = ethtool_get_phy_stats(dev, useraddr); + break; default: rc = -EOPNOTSUPP; } -- cgit v1.2.3 From c7862a5f0de5f521c545f3436f0aa190964342dd Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 28 Dec 2015 18:21:44 +0100 Subject: netfilter: nft_limit: allow to invert matching criteria This patch allows you to invert the ratelimit matching criteria, so you can match packets over the ratelimit. This is required to support what hashlimit does. 
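The resulting semantics can be modelled in a standalone sketch (token refill over time elided; this mirrors the logic of the diff below, it is not kernel code):

#include <stdbool.h>
#include <stdint.h>

struct limit_model {
	int64_t tokens;		/* remaining budget; the kernel refills this over time */
	bool invert;		/* true when userspace set NFT_LIMIT_F_INV */
};

/* Returns true when the limit expression matches the packet. */
static bool limit_matches(struct limit_model *l, int64_t cost)
{
	bool over = l->tokens < cost;	/* this packet would exceed the rate */

	if (!over)
		l->tokens -= cost;	/* within the rate: consume budget */

	/* plain: match while within the rate; inverted: match once over it */
	return over == l->invert;
}
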
Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 6 ++++++ net/netfilter/nft_limit.c | 16 +++++++++++++--- 2 files changed, 19 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index b48a3ab761f8..22043ce95ae6 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -780,6 +780,10 @@ enum nft_limit_type { NFT_LIMIT_PKT_BYTES }; +enum nft_limit_flags { + NFT_LIMIT_F_INV = (1 << 0), +}; + /** * enum nft_limit_attributes - nf_tables limit expression netlink attributes * @@ -787,6 +791,7 @@ enum nft_limit_type { * @NFTA_LIMIT_UNIT: refill unit (NLA_U64) * @NFTA_LIMIT_BURST: burst (NLA_U32) * @NFTA_LIMIT_TYPE: type of limit (NLA_U32: enum nft_limit_type) + * @NFTA_LIMIT_FLAGS: flags (NLA_U32: enum nft_limit_flags) */ enum nft_limit_attributes { NFTA_LIMIT_UNSPEC, @@ -794,6 +799,7 @@ enum nft_limit_attributes { NFTA_LIMIT_UNIT, NFTA_LIMIT_BURST, NFTA_LIMIT_TYPE, + NFTA_LIMIT_FLAGS, __NFTA_LIMIT_MAX }; #define NFTA_LIMIT_MAX (__NFTA_LIMIT_MAX - 1) diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c index 5d67938f8b2f..99d18578afc6 100644 --- a/net/netfilter/nft_limit.c +++ b/net/netfilter/nft_limit.c @@ -26,6 +26,7 @@ struct nft_limit { u64 rate; u64 nsecs; u32 burst; + bool invert; }; static inline bool nft_limit_eval(struct nft_limit *limit, u64 cost) @@ -44,11 +45,11 @@ static inline bool nft_limit_eval(struct nft_limit *limit, u64 cost) if (delta >= 0) { limit->tokens = delta; spin_unlock_bh(&limit_lock); - return false; + return limit->invert; } limit->tokens = tokens; spin_unlock_bh(&limit_lock); - return true; + return !limit->invert; } static int nft_limit_init(struct nft_limit *limit, @@ -78,6 +79,12 @@ static int nft_limit_init(struct nft_limit *limit, limit->rate = rate; } + if (tb[NFTA_LIMIT_FLAGS]) { + u32 flags = ntohl(nla_get_be32(tb[NFTA_LIMIT_FLAGS])); + + if (flags & NFT_LIMIT_F_INV) + limit->invert = true; + } limit->last = ktime_get_ns(); return 0; @@ -86,13 +93,15 @@ static int nft_limit_init(struct nft_limit *limit, static int nft_limit_dump(struct sk_buff *skb, const struct nft_limit *limit, enum nft_limit_type type) { + u32 flags = limit->invert ? NFT_LIMIT_F_INV : 0; u64 secs = div_u64(limit->nsecs, NSEC_PER_SEC); u64 rate = limit->rate - limit->burst; if (nla_put_be64(skb, NFTA_LIMIT_RATE, cpu_to_be64(rate)) || nla_put_be64(skb, NFTA_LIMIT_UNIT, cpu_to_be64(secs)) || nla_put_be32(skb, NFTA_LIMIT_BURST, htonl(limit->burst)) || - nla_put_be32(skb, NFTA_LIMIT_TYPE, htonl(type))) + nla_put_be32(skb, NFTA_LIMIT_TYPE, htonl(type)) || + nla_put_be32(skb, NFTA_LIMIT_FLAGS, htonl(flags))) goto nla_put_failure; return 0; @@ -120,6 +129,7 @@ static const struct nla_policy nft_limit_policy[NFTA_LIMIT_MAX + 1] = { [NFTA_LIMIT_UNIT] = { .type = NLA_U64 }, [NFTA_LIMIT_BURST] = { .type = NLA_U32 }, [NFTA_LIMIT_TYPE] = { .type = NLA_U32 }, + [NFTA_LIMIT_FLAGS] = { .type = NLA_U32 }, }; static int nft_limit_pkts_init(const struct nft_ctx *ctx, -- cgit v1.2.3 From 39e6dea28adc874f7021e5580c13cab0b58407ea Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 25 Nov 2015 13:39:38 +0100 Subject: netfilter: nf_tables: add forward expression to the netdev family You can use this to forward packets from ingress to the egress path of the specified interface. This provides a fast path to bounce packets from one interface to another specific destination interface. 
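In datapath terms the expression behaves roughly as modelled below (illustrative sketch, not the kernel code from the diff): the output ifindex is read from a source register chosen at rule-load time, the packet goes out on that device via the dup infrastructure, and the original is then consumed, which is what turns a "dup" into a "fwd":

#include <stdint.h>

#define NF_DROP	0	/* verdict that consumes the original packet */

struct regs_model {
	uint32_t data[16];	/* rule registers */
	uint32_t verdict;
};

/* xmit stands in for nf_dup_netdev_egress(). */
static void fwd_eval_model(struct regs_model *regs, unsigned int sreg_dev,
			   void (*xmit)(int ifindex))
{
	int oif = (int)regs->data[sreg_dev];	/* validated as a 32-bit load at init */

	xmit(oif);			/* send a clone out of the target device */
	regs->verdict = NF_DROP;	/* drop the original: forward, not mirror */
}
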
Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 12 ++++ net/netfilter/Kconfig | 6 ++ net/netfilter/Makefile | 1 + net/netfilter/nft_fwd_netdev.c | 98 ++++++++++++++++++++++++++++++++ 4 files changed, 117 insertions(+) create mode 100644 net/netfilter/nft_fwd_netdev.c (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 22043ce95ae6..731288a039f6 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -983,6 +983,18 @@ enum nft_dup_attributes { }; #define NFTA_DUP_MAX (__NFTA_DUP_MAX - 1) +/** + * enum nft_fwd_attributes - nf_tables fwd expression netlink attributes + * + * @NFTA_FWD_SREG_DEV: source register of output interface (NLA_U32: nft_register) + */ +enum nft_fwd_attributes { + NFTA_FWD_UNSPEC, + NFTA_FWD_SREG_DEV, + __NFTA_FWD_MAX +}; +#define NFTA_FWD_MAX (__NFTA_FWD_MAX - 1) + /** * enum nft_gen_attributes - nf_tables ruleset generation attributes * diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 8514cc4b22a8..8c067e6663a1 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -577,6 +577,12 @@ config NFT_DUP_NETDEV help This option enables packet duplication for the "netdev" family. +config NFT_FWD_NETDEV + tristate "Netfilter nf_tables netdev packet forwarding support" + select NF_DUP_NETDEV + help + This option enables packet forwarding for the "netdev" family. + endif # NF_TABLES_NETDEV endif # NF_TABLES diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 5c9913359537..69134541d65b 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -95,6 +95,7 @@ obj-$(CONFIG_NFT_REDIR) += nft_redir.o # nf_tables netdev obj-$(CONFIG_NFT_DUP_NETDEV) += nft_dup_netdev.o +obj-$(CONFIG_NFT_FWD_NETDEV) += nft_fwd_netdev.o # generic X tables obj-$(CONFIG_NETFILTER_XTABLES) += x_tables.o xt_tcpudp.o diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c new file mode 100644 index 000000000000..763ebc3e0b2b --- /dev/null +++ b/net/netfilter/nft_fwd_netdev.c @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2015 Pablo Neira Ayuso + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +struct nft_fwd_netdev { + enum nft_registers sreg_dev:8; +}; + +static void nft_fwd_netdev_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_fwd_netdev *priv = nft_expr_priv(expr); + int oif = regs->data[priv->sreg_dev]; + + nf_dup_netdev_egress(pkt, oif); + regs->verdict.code = NF_DROP; +} + +static const struct nla_policy nft_fwd_netdev_policy[NFTA_FWD_MAX + 1] = { + [NFTA_FWD_SREG_DEV] = { .type = NLA_U32 }, +}; + +static int nft_fwd_netdev_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_fwd_netdev *priv = nft_expr_priv(expr); + + if (tb[NFTA_FWD_SREG_DEV] == NULL) + return -EINVAL; + + priv->sreg_dev = nft_parse_register(tb[NFTA_FWD_SREG_DEV]); + return nft_validate_register_load(priv->sreg_dev, sizeof(int)); +} + +static const struct nft_expr_ops nft_fwd_netdev_ingress_ops; + +static int nft_fwd_netdev_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + struct nft_fwd_netdev *priv = nft_expr_priv(expr); + + if (nft_dump_register(skb, NFTA_FWD_SREG_DEV, priv->sreg_dev)) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -1; +} + +static struct nft_expr_type nft_fwd_netdev_type; +static const struct nft_expr_ops nft_fwd_netdev_ops = { + .type = &nft_fwd_netdev_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_fwd_netdev)), + .eval = nft_fwd_netdev_eval, + .init = nft_fwd_netdev_init, + .dump = nft_fwd_netdev_dump, +}; + +static struct nft_expr_type nft_fwd_netdev_type __read_mostly = { + .family = NFPROTO_NETDEV, + .name = "fwd", + .ops = &nft_fwd_netdev_ops, + .policy = nft_fwd_netdev_policy, + .maxattr = NFTA_FWD_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_fwd_netdev_module_init(void) +{ + return nft_register_expr(&nft_fwd_netdev_type); +} + +static void __exit nft_fwd_netdev_module_exit(void) +{ + nft_unregister_expr(&nft_fwd_netdev_type); +} + +module_init(nft_fwd_netdev_module_init); +module_exit(nft_fwd_netdev_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Pablo Neira Ayuso "); +MODULE_ALIAS_NFT_AF_EXPR(5, "fwd"); -- cgit v1.2.3 From 2fbf575867e5a181a3f3e5e29a2f0c205cca5fb3 Mon Sep 17 00:00:00 2001 From: "xypron.glpk@gmx.de" Date: Tue, 5 Jan 2016 10:12:49 +0100 Subject: include/uapi/linux/sockios.h: mark SIOCRTMSG unused IOCTL SIOCRTMSG does nothing but return EINVAL. So comment it as unused. SIOCRTMSG is only used in: * net/ipv4/af_inet.c * include/uapi/linux/sockios.h inet_ioctl calls ip_rt_ioctl. ip_rt_ioctl only handles SIOCADDRT and SIOCDELRT and returns -EINVAL otherwise. Signed-off-by: Heinrich Schuchardt Signed-off-by: David S. Miller --- include/uapi/linux/sockios.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/sockios.h b/include/uapi/linux/sockios.h index e888b1aed69f..8e7890b26d9a 100644 --- a/include/uapi/linux/sockios.h +++ b/include/uapi/linux/sockios.h @@ -27,7 +27,7 @@ /* Routing table calls. */ #define SIOCADDRT 0x890B /* add routing table entry */ #define SIOCDELRT 0x890C /* delete routing table entry */ -#define SIOCRTMSG 0x890D /* call to routing system */ +#define SIOCRTMSG 0x890D /* unused */ /* Socket configuration controls. 
*/ #define SIOCGIFNAME 0x8910 /* get iface name */ -- cgit v1.2.3 From e6d8ecac9e68265aee9be711c5bd29406129666f Mon Sep 17 00:00:00 2001 From: Carlos Falgueras García Date: Tue, 5 Jan 2016 14:03:32 +0100 Subject: netfilter: nf_tables: Add new attributes into nft_set to store user data. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit User data is stored after the 'nft_set_ops' private data in the 'data[]' flexible array. The field 'udata' points to the user data and 'udlen' stores its length. Add the new attribute NFTA_SET_USERDATA. Signed-off-by: Carlos Falgueras García Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 4 ++++ include/uapi/linux/netfilter/nf_tables.h | 2 ++ net/netfilter/nf_tables_api.c | 21 ++++++++++++++++++++- 3 files changed, 26 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 0191fbb33a2f..f6b1daf2e698 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -291,6 +291,8 @@ void nft_unregister_set(struct nft_set_ops *ops); * @timeout: default timeout value in msecs * @gc_int: garbage collection interval in msecs * @policy: set parameterization (see enum nft_set_policies) + * @udlen: user data length + * @udata: user data * @ops: set ops * @pnet: network namespace * @flags: set flags @@ -310,6 +312,8 @@ struct nft_set { u64 timeout; u32 gc_int; u16 policy; + u16 udlen; + unsigned char *udata; /* runtime data below here */ const struct nft_set_ops *ops ____cacheline_aligned; possible_net_t pnet; diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 731288a039f6..03c28a402c63 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -291,6 +291,7 @@ enum nft_set_desc_attributes { * @NFTA_SET_ID: uniquely identifies a set in a transaction (NLA_U32) * @NFTA_SET_TIMEOUT: default timeout value (NLA_U64) * @NFTA_SET_GC_INTERVAL: garbage collection interval (NLA_U32) + * @NFTA_SET_USERDATA: user data (NLA_BINARY) */ enum nft_set_attributes { NFTA_SET_UNSPEC, @@ -306,6 +307,7 @@ enum nft_set_attributes { NFTA_SET_ID, NFTA_SET_TIMEOUT, NFTA_SET_GC_INTERVAL, + NFTA_SET_USERDATA, __NFTA_SET_MAX }; #define NFTA_SET_MAX (__NFTA_SET_MAX - 1) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index f5c397158e29..2011977cd79d 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2323,6 +2323,8 @@ static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = { [NFTA_SET_ID] = { .type = NLA_U32 }, [NFTA_SET_TIMEOUT] = { .type = NLA_U64 }, [NFTA_SET_GC_INTERVAL] = { .type = NLA_U32 }, + [NFTA_SET_USERDATA] = { .type = NLA_BINARY, .len = NFT_USERDATA_MAXLEN }, }; static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = { @@ -2482,6 +2484,9 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, goto nla_put_failure; } + if (nla_put(skb, NFTA_SET_USERDATA, set->udlen, set->udata)) + goto nla_put_failure; + desc = nla_nest_start(skb, NFTA_SET_DESC); if (desc == NULL) goto nla_put_failure; @@ -2691,6 +2696,8 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, u64 timeout; u32 ktype, dtype, flags, policy, gc_int; struct nft_set_desc desc; + unsigned char *udata; + u16 udlen; int err; if (nla[NFTA_SET_TABLE] == NULL || @@ -2803,12 +2810,16 @@ static int nf_tables_newset(struct net *net, struct sock
*nlsk, if (IS_ERR(ops)) return PTR_ERR(ops); + udlen = 0; + if (nla[NFTA_SET_USERDATA]) + udlen = nla_len(nla[NFTA_SET_USERDATA]); + size = 0; if (ops->privsize != NULL) size = ops->privsize(nla); err = -ENOMEM; - set = kzalloc(sizeof(*set) + size, GFP_KERNEL); + set = kzalloc(sizeof(*set) + size + udlen, GFP_KERNEL); if (set == NULL) goto err1; @@ -2817,6 +2828,12 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, if (err < 0) goto err2; + udata = NULL; + if (udlen) { + udata = set->data + size; + nla_memcpy(udata, nla[NFTA_SET_USERDATA], udlen); + } + INIT_LIST_HEAD(&set->bindings); write_pnet(&set->pnet, net); set->ops = ops; @@ -2827,6 +2844,8 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, set->flags = flags; set->size = desc.size; set->policy = policy; + set->udlen = udlen; + set->udata = udata; set->timeout = timeout; set->gc_int = gc_int; -- cgit v1.2.3 From 48f66c905a976bf0ff092fc24f08d9addd82a245 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 7 Jan 2016 21:34:24 +0100 Subject: netfilter: nft_ct: add byte/packet counter support If the accounting extension isn't present, we'll return a counter value of 0. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 2 ++ net/netfilter/nft_ct.c | 38 ++++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 03c28a402c63..be41ffc128b8 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -757,6 +757,8 @@ enum nft_ct_keys { NFT_CT_PROTO_SRC, NFT_CT_PROTO_DST, NFT_CT_LABELS, + NFT_CT_PKTS, + NFT_CT_BYTES, }; /** diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 8cbca3432f90..6f74109a7ed3 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -30,6 +31,18 @@ struct nft_ct { }; }; +static u64 nft_ct_get_eval_counter(const struct nf_conn_counter *c, + enum nft_ct_keys k, + enum ip_conntrack_dir d) +{ + if (d < IP_CT_DIR_MAX) + return k == NFT_CT_BYTES ? atomic64_read(&c[d].bytes) : + atomic64_read(&c[d].packets); + + return nft_ct_get_eval_counter(c, k, IP_CT_DIR_ORIGINAL) + + nft_ct_get_eval_counter(c, k, IP_CT_DIR_REPLY); +} + static void nft_ct_get_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -114,6 +127,17 @@ static void nft_ct_get_eval(const struct nft_expr *expr, NF_CT_LABELS_MAX_SIZE - size); return; } + case NFT_CT_BYTES: /* fallthrough */ + case NFT_CT_PKTS: { + const struct nf_conn_acct *acct = nf_conn_acct_find(ct); + u64 count = 0; + + if (acct) + count = nft_ct_get_eval_counter(acct->counter, + priv->key, priv->dir); + memcpy(dest, &count, sizeof(count)); + return; + } #endif default: break; @@ -291,6 +315,13 @@ static int nft_ct_get_init(const struct nft_ctx *ctx, return -EINVAL; len = FIELD_SIZEOF(struct nf_conntrack_tuple, src.u.all); break; + case NFT_CT_BYTES: + case NFT_CT_PKTS: + /* no direction? 
return sum of original + reply */ if (tb[NFTA_CT_DIRECTION] == NULL) priv->dir = IP_CT_DIR_MAX; len = sizeof(u64); break; default: return -EOPNOTSUPP; } @@ -373,6 +404,13 @@ static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr) case NFT_CT_PROTO_DST: if (nla_put_u8(skb, NFTA_CT_DIRECTION, priv->dir)) goto nla_put_failure; + break; + case NFT_CT_BYTES: + case NFT_CT_PKTS: + if (priv->dir < IP_CT_DIR_MAX && + nla_put_u8(skb, NFTA_CT_DIRECTION, priv->dir)) + goto nla_put_failure; + break; default: break; } -- cgit v1.2.3 From 1f211a1b929c804100e138c5d3d656992cfd5622 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 7 Jan 2016 22:29:47 +0100 Subject: net, sched: add clsact qdisc This work adds a generalization of the ingress qdisc as a qdisc holding only classifiers. The clsact qdisc works on ingress, but also on egress. In both cases, its execution happens without taking the qdisc lock, and the main difference for the egress part compared to the prior version of [1] is that this can be applied with _any_ underlying real egress qdisc (also classless ones). Besides solving the use-case of [1], that is, allowing for more programmability on assigning skb->priority for the mqprio case that is supported by most popular 10G+ NICs, it also opens up a lot more flexibility for other tc applications. The main work on classification can already be done at clsact egress time if the use-case allows, and state stored for later retrieval f.e. again in skb->priority with major/minors (which is checked by most classful qdiscs before consulting tc_classify()) and/or in other skb fields like skb->tc_index for some light-weight post-processing to get to the eventual classid in case of a classful qdisc. Another use case is that the clsact egress part allows to have a central egress counterpart to the ingress classifiers, so that classifiers can easily share state (e.g. in cls_bpf via eBPF maps) for ingress and egress. Currently, default setups like mq + pfifo_fast would require using, for example, the prio qdisc instead (to get a tc_classify() run) and duplicating the egress classifier for each queue. With clsact, the setup can be left as is: it can additionally assign skb->priority to put the skb in one of pfifo_fast's bands and it can share state with maps. Moreover, we can access the skb's dst entry (f.e. to retrieve tclassid) w/o the need to perform a skb_dst_force() to hold on to it any longer. In the lwt case, we can also use this facility to set up dst metadata via cls_bpf (bpf_skb_set_tunnel_key()) without needing a real egress qdisc just for that (case of IFF_NO_QUEUE devices, for example). The realization can be done without any changes to the scheduler core framework. All it takes is that we have two a-priori defined minors/child classes, where we can mux between ingress and egress classifier list (dev->ingress_cl_list and dev->egress_cl_list, the latter stored close to dev->_tx to avoid an extra cacheline miss for moderate loads). The egress part is modelled a bit similarly to handle_ing() and patched to a noop in case the functionality is not used. Both handlers are now called sch_handle_ingress() and sch_handle_egress(); code sharing among the two doesn't seem practical as there are various minor differences in both paths, so making them conditional in a single handler would rather slow things down. Full compatibility to ingress qdisc is provided as well; a minimal egress classifier of this kind is sketched below.
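For illustration, a hedged sketch in restricted C for cls_bpf in direct-action mode (the section name matches the filter example later in this message; the priority value is arbitrary):

#include <linux/bpf.h>
#include <linux/pkt_cls.h>

/* Runs at clsact's egress hook and assigns skb->priority; priority 6
 * (TC_PRIO_INTERACTIVE) maps to band 0 in pfifo_fast's default priomap
 * shown in the qdisc listing below. */
__attribute__((section("egress"), used))
int egress_prio(struct __sk_buff *skb)
{
	skb->priority = 6;
	return TC_ACT_OK;
}

char __license[] __attribute__((section("license"), used)) = "GPL";

Built with clang -O2 -target bpf, it attaches with the "tc filter add ... egress bpf da obj ... sec egress" invocation shown in the example further below.
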
Since both piggyback on TC_H_CLSACT, only one of them (ingress/clsact) can exist per netdevice, and thus ingress qdisc specific behaviour can be retained for user space. This means, either a user does 'tc qdisc add dev foo ingress' and configures ingress qdisc as usual, or the 'tc qdisc add dev foo clsact' alternative, where both, ingress and egress classifier can be configured as in the below example. ingress qdisc supports attaching classifier to any minor number whereas clsact has two fixed minors for muxing between the lists, therefore to not break user space setups, they are better done as two separate qdiscs. I decided to extend the sch_ingress module with clsact functionality so that commonly used code can be reused, the module is being aliased with sch_clsact so that it can be auto-loaded properly. Alternative would have been to add a flag when initializing ingress to alter its behaviour plus aliasing to a different name (as it's more than just ingress). However, the first would end up, based on the flag, choosing the new/old behaviour by calling different function implementations to handle each anyway, the latter would require to register ingress qdisc once again under different alias. So, this really begs to provide a minimal, cleaner approach to have Qdisc_ops and Qdisc_class_ops by its own that share callbacks used by both. Example, adding qdisc: # tc qdisc add dev foo clsact # tc qdisc show dev foo qdisc mq 0: root qdisc pfifo_fast 0: parent :1 bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1 qdisc pfifo_fast 0: parent :2 bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1 qdisc pfifo_fast 0: parent :3 bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1 qdisc pfifo_fast 0: parent :4 bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1 qdisc clsact ffff: parent ffff:fff1 Adding filters (deleting, etc works analogous by specifying ingress/egress): # tc filter add dev foo ingress bpf da obj bar.o sec ingress # tc filter add dev foo egress bpf da obj bar.o sec egress # tc filter show dev foo ingress filter protocol all pref 49152 bpf filter protocol all pref 49152 bpf handle 0x1 bar.o:[ingress] direct-action # tc filter show dev foo egress filter protocol all pref 49152 bpf filter protocol all pref 49152 bpf handle 0x1 bar.o:[egress] direct-action A 'tc filter show dev foo' or 'tc filter show dev foo parent ffff:' will show an empty list for clsact. Either using the parent names (ingress/egress) or specifying the full major/minor will then show the related filter lists. Prior work on a mqprio prequeue() facility [1] was done mainly by John Fastabend. [1] http://patchwork.ozlabs.org/patch/512949/ Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 4 +- include/linux/rtnetlink.h | 5 +++ include/uapi/linux/pkt_sched.h | 4 ++ net/Kconfig | 3 ++ net/core/dev.c | 82 +++++++++++++++++++++++++++++++++++---- net/sched/Kconfig | 14 +++++-- net/sched/cls_bpf.c | 2 +- net/sched/sch_ingress.c | 88 +++++++++++++++++++++++++++++++++++++++++- 8 files changed, 186 insertions(+), 16 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 8d8e5ca951b4..2285596e7045 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1739,7 +1739,9 @@ struct net_device { #ifdef CONFIG_XPS struct xps_dev_maps __rcu *xps_maps; #endif - +#ifdef CONFIG_NET_CLS_ACT + struct tcf_proto __rcu *egress_cl_list; +#endif #ifdef CONFIG_NET_SWITCHDEV u32 offload_fwd_mark; #endif diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 4be5048b1fbe..c006cc900c44 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -84,6 +84,11 @@ void net_inc_ingress_queue(void); void net_dec_ingress_queue(void); #endif +#ifdef CONFIG_NET_EGRESS +void net_inc_egress_queue(void); +void net_dec_egress_queue(void); +#endif + extern void rtnetlink_init(void); extern void __rtnl_unlock(void); diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 8d2530daca9f..8cb18b44968e 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -72,6 +72,10 @@ struct tc_estimator { #define TC_H_UNSPEC (0U) #define TC_H_ROOT (0xFFFFFFFFU) #define TC_H_INGRESS (0xFFFFFFF1U) +#define TC_H_CLSACT TC_H_INGRESS + +#define TC_H_MIN_INGRESS 0xFFF2U +#define TC_H_MIN_EGRESS 0xFFF3U /* Need to corrospond to iproute2 tc/tc_core.h "enum link_layer" */ enum tc_link_layer { diff --git a/net/Kconfig b/net/Kconfig index 11f8c22af34d..174354618f8a 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -48,6 +48,9 @@ config COMPAT_NETLINK_MESSAGES config NET_INGRESS bool +config NET_EGRESS + bool + menu "Networking options" source "net/packet/Kconfig" diff --git a/net/core/dev.c b/net/core/dev.c index 914b4a24c654..0ca95d5d7af0 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1676,6 +1676,22 @@ void net_dec_ingress_queue(void) EXPORT_SYMBOL_GPL(net_dec_ingress_queue); #endif +#ifdef CONFIG_NET_EGRESS +static struct static_key egress_needed __read_mostly; + +void net_inc_egress_queue(void) +{ + static_key_slow_inc(&egress_needed); +} +EXPORT_SYMBOL_GPL(net_inc_egress_queue); + +void net_dec_egress_queue(void) +{ + static_key_slow_dec(&egress_needed); +} +EXPORT_SYMBOL_GPL(net_dec_egress_queue); +#endif + static struct static_key netstamp_needed __read_mostly; #ifdef HAVE_JUMP_LABEL /* We are not allowed to call static_key_slow_dec() from irq context @@ -3007,7 +3023,6 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, bool contended; int rc; - qdisc_pkt_len_init(skb); qdisc_calculate_pkt_len(skb, q); /* * Heuristic to force contended enqueues to serialize on a @@ -3100,6 +3115,49 @@ int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL(dev_loopback_xmit); +#ifdef CONFIG_NET_EGRESS +static struct sk_buff * +sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) +{ + struct tcf_proto *cl = rcu_dereference_bh(dev->egress_cl_list); + struct tcf_result cl_res; + + if (!cl) + return skb; + + /* skb->tc_verd and qdisc_skb_cb(skb)->pkt_len were already set + * earlier by the caller. 
+ */ + qdisc_bstats_cpu_update(cl->q, skb); + + switch (tc_classify(skb, cl, &cl_res, false)) { + case TC_ACT_OK: + case TC_ACT_RECLASSIFY: + skb->tc_index = TC_H_MIN(cl_res.classid); + break; + case TC_ACT_SHOT: + qdisc_qstats_cpu_drop(cl->q); + *ret = NET_XMIT_DROP; + goto drop; + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + *ret = NET_XMIT_SUCCESS; +drop: + kfree_skb(skb); + return NULL; + case TC_ACT_REDIRECT: + /* No need to push/pop skb's mac_header here on egress! */ + skb_do_redirect(skb); + *ret = NET_XMIT_SUCCESS; + return NULL; + default: + break; + } + + return skb; +} +#endif /* CONFIG_NET_EGRESS */ + static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) { #ifdef CONFIG_XPS @@ -3226,6 +3284,17 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) skb_update_prio(skb); + qdisc_pkt_len_init(skb); +#ifdef CONFIG_NET_CLS_ACT + skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS); +# ifdef CONFIG_NET_EGRESS + if (static_key_false(&egress_needed)) { + skb = sch_handle_egress(skb, &rc, dev); + if (!skb) + goto out; + } +# endif +#endif /* If device/qdisc don't need skb->dst, release it right now while * its hot in this cpu cache. */ @@ -3247,9 +3316,6 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) txq = netdev_pick_tx(dev, skb, accel_priv); q = rcu_dereference_bh(txq->qdisc); -#ifdef CONFIG_NET_CLS_ACT - skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS); -#endif trace_net_dev_queue(skb); if (q->enqueue) { rc = __dev_xmit_skb(skb, q, dev, txq); @@ -3806,9 +3872,9 @@ int (*br_fdb_test_addr_hook)(struct net_device *dev, EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook); #endif -static inline struct sk_buff *handle_ing(struct sk_buff *skb, - struct packet_type **pt_prev, - int *ret, struct net_device *orig_dev) +static inline struct sk_buff * +sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, + struct net_device *orig_dev) { #ifdef CONFIG_NET_CLS_ACT struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list); @@ -4002,7 +4068,7 @@ another_round: skip_taps: #ifdef CONFIG_NET_INGRESS if (static_key_false(&ingress_needed)) { - skb = handle_ing(skb, &pt_prev, &ret, orig_dev); + skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev); if (!skb) goto out; diff --git a/net/sched/Kconfig b/net/sched/Kconfig index daa33432b716..82830824fb1f 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -310,15 +310,21 @@ config NET_SCH_PIE If unsure, say N. config NET_SCH_INGRESS - tristate "Ingress Qdisc" + tristate "Ingress/classifier-action Qdisc" depends on NET_CLS_ACT select NET_INGRESS + select NET_EGRESS ---help--- - Say Y here if you want to use classifiers for incoming packets. + Say Y here if you want to use classifiers for incoming and/or outgoing + packets. This qdisc doesn't do anything else besides running classifiers, + which can also have actions attached to them. In case of outgoing packets, + classifiers that this qdisc holds are executed in the transmit path + before real enqueuing to an egress qdisc happens. + If unsure, say Y. - To compile this code as a module, choose M here: the - module will be called sch_ingress. + To compile this code as a module, choose M here: the module will be + called sch_ingress with alias of sch_clsact. 
config NET_SCH_PLUG tristate "Plug network traffic until release (PLUG)" diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index b3c8bb4aeef5..8dc84300ee79 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -291,7 +291,7 @@ static int cls_bpf_prog_from_efd(struct nlattr **tb, struct cls_bpf_prog *prog, prog->bpf_name = name; prog->filter = fp; - if (fp->dst_needed) + if (fp->dst_needed && !(tp->q->flags & TCQ_F_INGRESS)) netif_keep_dst(qdisc_dev(tp->q)); return 0; diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index e7c648fa9dc3..10adbc617905 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -1,4 +1,5 @@ -/* net/sched/sch_ingress.c - Ingress qdisc +/* net/sched/sch_ingress.c - Ingress and clsact qdisc + * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version @@ -98,17 +99,100 @@ static struct Qdisc_ops ingress_qdisc_ops __read_mostly = { .owner = THIS_MODULE, }; +static unsigned long clsact_get(struct Qdisc *sch, u32 classid) +{ + switch (TC_H_MIN(classid)) { + case TC_H_MIN(TC_H_MIN_INGRESS): + case TC_H_MIN(TC_H_MIN_EGRESS): + return TC_H_MIN(classid); + default: + return 0; + } +} + +static unsigned long clsact_bind_filter(struct Qdisc *sch, + unsigned long parent, u32 classid) +{ + return clsact_get(sch, classid); +} + +static struct tcf_proto __rcu **clsact_find_tcf(struct Qdisc *sch, + unsigned long cl) +{ + struct net_device *dev = qdisc_dev(sch); + + switch (cl) { + case TC_H_MIN(TC_H_MIN_INGRESS): + return &dev->ingress_cl_list; + case TC_H_MIN(TC_H_MIN_EGRESS): + return &dev->egress_cl_list; + default: + return NULL; + } +} + +static int clsact_init(struct Qdisc *sch, struct nlattr *opt) +{ + net_inc_ingress_queue(); + net_inc_egress_queue(); + + sch->flags |= TCQ_F_CPUSTATS; + + return 0; +} + +static void clsact_destroy(struct Qdisc *sch) +{ + struct net_device *dev = qdisc_dev(sch); + + tcf_destroy_chain(&dev->ingress_cl_list); + tcf_destroy_chain(&dev->egress_cl_list); + + net_dec_ingress_queue(); + net_dec_egress_queue(); +} + +static const struct Qdisc_class_ops clsact_class_ops = { + .leaf = ingress_leaf, + .get = clsact_get, + .put = ingress_put, + .walk = ingress_walk, + .tcf_chain = clsact_find_tcf, + .bind_tcf = clsact_bind_filter, + .unbind_tcf = ingress_put, +}; + +static struct Qdisc_ops clsact_qdisc_ops __read_mostly = { + .cl_ops = &clsact_class_ops, + .id = "clsact", + .init = clsact_init, + .destroy = clsact_destroy, + .dump = ingress_dump, + .owner = THIS_MODULE, +}; + static int __init ingress_module_init(void) { - return register_qdisc(&ingress_qdisc_ops); + int ret; + + ret = register_qdisc(&ingress_qdisc_ops); + if (!ret) { + ret = register_qdisc(&clsact_qdisc_ops); + if (ret) + unregister_qdisc(&ingress_qdisc_ops); + } + + return ret; } static void __exit ingress_module_exit(void) { unregister_qdisc(&ingress_qdisc_ops); + unregister_qdisc(&clsact_qdisc_ops); } module_init(ingress_module_init); module_exit(ingress_module_exit); +MODULE_ALIAS("sch_clsact"); MODULE_LICENSE("GPL"); -- cgit v1.2.3 From 781c53bc5d5628065a46c70f02f5a0450f5842f4 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 11 Jan 2016 01:16:38 +0100 Subject: bpf: export helper function flags and reject invalid ones Export flags used by eBPF helper functions through UAPI, so they can be used by programs (instead of them redefining all flags each time or just using the hard-coded values). 
It also gives a better overview what flags are used where and we can further get rid of the extra macros defined in filter.c. Moreover, reject invalid flags. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 16 ++++++++++++++++ net/core/filter.c | 37 +++++++++++++++++++++++-------------- 2 files changed, 39 insertions(+), 14 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 8bed7f1176b8..d94797ce9a5a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -273,6 +273,22 @@ enum bpf_func_id { __BPF_FUNC_MAX_ID, }; +/* All flags used by eBPF helper functions, placed here. */ + +/* BPF_FUNC_skb_store_bytes flags. */ +#define BPF_F_RECOMPUTE_CSUM (1ULL << 0) + +/* BPF_FUNC_l3_csum_replace and BPF_FUNC_l4_csum_replace flags. + * First 4 bits are for passing the header field size. + */ +#define BPF_F_HDR_FIELD_MASK 0xfULL + +/* BPF_FUNC_l4_csum_replace flags. */ +#define BPF_F_PSEUDO_HDR (1ULL << 4) + +/* BPF_FUNC_clone_redirect and BPF_FUNC_redirect flags. */ +#define BPF_F_INGRESS (1ULL << 0) + /* user accessible mirror of in-kernel sk_buff. * new fields can only be added to the end of this structure */ diff --git a/net/core/filter.c b/net/core/filter.c index 0db92b5e2cbf..7c55cadc0f38 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1328,8 +1328,7 @@ int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk) return 0; } -#define BPF_RECOMPUTE_CSUM(flags) ((flags) & 1) -#define BPF_LDST_LEN 16U +#define BPF_LDST_LEN 16U static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags) { @@ -1340,6 +1339,9 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags) char buf[BPF_LDST_LEN]; void *ptr; + if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM))) + return -EINVAL; + /* bpf verifier guarantees that: * 'from' pointer points to bpf program stack * 'len' bytes of it were initialized @@ -1359,7 +1361,7 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags) if (unlikely(!ptr)) return -EFAULT; - if (BPF_RECOMPUTE_CSUM(flags)) + if (flags & BPF_F_RECOMPUTE_CSUM) skb_postpull_rcsum(skb, ptr, len); memcpy(ptr, from, len); @@ -1368,7 +1370,7 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags) /* skb_store_bits cannot return -EFAULT here */ skb_store_bits(skb, offset, ptr, len); - if (BPF_RECOMPUTE_CSUM(flags)) + if (flags & BPF_F_RECOMPUTE_CSUM) skb_postpush_rcsum(skb, ptr, len); return 0; @@ -1415,15 +1417,14 @@ const struct bpf_func_proto bpf_skb_load_bytes_proto = { .arg4_type = ARG_CONST_STACK_SIZE, }; -#define BPF_HEADER_FIELD_SIZE(flags) ((flags) & 0x0f) -#define BPF_IS_PSEUDO_HEADER(flags) ((flags) & 0x10) - static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) { struct sk_buff *skb = (struct sk_buff *) (long) r1; int offset = (int) r2; __sum16 sum, *ptr; + if (unlikely(flags & ~(BPF_F_HDR_FIELD_MASK))) + return -EINVAL; if (unlikely((u32) offset > 0xffff)) return -EFAULT; @@ -1435,7 +1436,7 @@ static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) if (unlikely(!ptr)) return -EFAULT; - switch (BPF_HEADER_FIELD_SIZE(flags)) { + switch (flags & BPF_F_HDR_FIELD_MASK) { case 2: csum_replace2(ptr, from, to); break; @@ -1467,10 +1468,12 @@ const struct bpf_func_proto bpf_l3_csum_replace_proto = { static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) { struct sk_buff *skb = (struct sk_buff *) (long) r1; - bool 
is_pseudo = !!BPF_IS_PSEUDO_HEADER(flags); + bool is_pseudo = flags & BPF_F_PSEUDO_HDR; int offset = (int) r2; __sum16 sum, *ptr; + if (unlikely(flags & ~(BPF_F_PSEUDO_HDR | BPF_F_HDR_FIELD_MASK))) + return -EINVAL; if (unlikely((u32) offset > 0xffff)) return -EFAULT; @@ -1482,7 +1485,7 @@ static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) if (unlikely(!ptr)) return -EFAULT; - switch (BPF_HEADER_FIELD_SIZE(flags)) { + switch (flags & BPF_F_HDR_FIELD_MASK) { case 2: inet_proto_csum_replace2(ptr, skb, from, to, is_pseudo); break; @@ -1511,13 +1514,14 @@ const struct bpf_func_proto bpf_l4_csum_replace_proto = { .arg5_type = ARG_ANYTHING, }; -#define BPF_IS_REDIRECT_INGRESS(flags) ((flags) & 1) - static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5) { struct sk_buff *skb = (struct sk_buff *) (long) r1, *skb2; struct net_device *dev; + if (unlikely(flags & ~(BPF_F_INGRESS))) + return -EINVAL; + dev = dev_get_by_index_rcu(dev_net(skb->dev), ifindex); if (unlikely(!dev)) return -EINVAL; @@ -1526,7 +1530,7 @@ static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5) if (unlikely(!skb2)) return -ENOMEM; - if (BPF_IS_REDIRECT_INGRESS(flags)) { + if (flags & BPF_F_INGRESS) { if (skb_at_tc_ingress(skb2)) skb_postpush_rcsum(skb2, skb_mac_header(skb2), skb2->mac_len); @@ -1553,12 +1557,17 @@ struct redirect_info { }; static DEFINE_PER_CPU(struct redirect_info, redirect_info); + static u64 bpf_redirect(u64 ifindex, u64 flags, u64 r3, u64 r4, u64 r5) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); + if (unlikely(flags & ~(BPF_F_INGRESS))) + return TC_ACT_SHOT; + ri->ifindex = ifindex; ri->flags = flags; + return TC_ACT_REDIRECT; } @@ -1574,7 +1583,7 @@ int skb_do_redirect(struct sk_buff *skb) return -EINVAL; } - if (BPF_IS_REDIRECT_INGRESS(ri->flags)) { + if (ri->flags & BPF_F_INGRESS) { if (skb_at_tc_ingress(skb)) skb_postpush_rcsum(skb, skb_mac_header(skb), skb->mac_len); -- cgit v1.2.3 From c6c33454072fc9fe961e2b25f22a619e4fa98838 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 11 Jan 2016 01:16:39 +0100 Subject: bpf: support ipv6 for bpf_skb_{set,get}_tunnel_key After IPv6 support has recently been added to metadata dst and related encaps, add support for populating/reading it from an eBPF program. Commit d3aa45ce6b ("bpf: add helpers to access tunnel metadata") started with initial IPv4-only support back then (due to IPv6 metadata support not being available yet). To stay compatible with older programs, we need to test for the passed structure size. Also TOS and TTL support from the ip_tunnel_info key has been added. Tested with vxlan devs in collect meta data mode with IPv4, IPv6 and in compat mode over different network namespaces. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 10 ++++++- net/core/filter.c | 69 +++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 71 insertions(+), 8 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d94797ce9a5a..aa6f8571de13 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -289,6 +289,9 @@ enum bpf_func_id { /* BPF_FUNC_clone_redirect and BPF_FUNC_redirect flags. */ #define BPF_F_INGRESS (1ULL << 0) +/* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */ +#define BPF_F_TUNINFO_IPV6 (1ULL << 0) + /* user accessible mirror of in-kernel sk_buff. 
* new fields can only be added to the end of this structure */ @@ -312,7 +315,12 @@ struct __sk_buff { struct bpf_tunnel_key { __u32 tunnel_id; - __u32 remote_ipv4; + union { + __u32 remote_ipv4; + __u32 remote_ipv6[4]; + }; + __u8 tunnel_tos; + __u8 tunnel_ttl; }; #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/net/core/filter.c b/net/core/filter.c index 7c55cadc0f38..77cdfb455e7f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1680,19 +1680,49 @@ bool bpf_helper_changes_skb_data(void *func) return false; } +static unsigned short bpf_tunnel_key_af(u64 flags) +{ + return flags & BPF_F_TUNINFO_IPV6 ? AF_INET6 : AF_INET; +} + static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) { struct sk_buff *skb = (struct sk_buff *) (long) r1; struct bpf_tunnel_key *to = (struct bpf_tunnel_key *) (long) r2; - struct ip_tunnel_info *info = skb_tunnel_info(skb); + const struct ip_tunnel_info *info = skb_tunnel_info(skb); + u8 compat[sizeof(struct bpf_tunnel_key)]; - if (unlikely(size != sizeof(struct bpf_tunnel_key) || flags || !info)) - return -EINVAL; - if (ip_tunnel_info_af(info) != AF_INET) + if (unlikely(!info || (flags & ~(BPF_F_TUNINFO_IPV6)))) return -EINVAL; + if (ip_tunnel_info_af(info) != bpf_tunnel_key_af(flags)) + return -EPROTO; + if (unlikely(size != sizeof(struct bpf_tunnel_key))) { + switch (size) { + case offsetof(struct bpf_tunnel_key, remote_ipv6[1]): + /* Fixup deprecated structure layouts here, so we have + * a common path later on. + */ + if (ip_tunnel_info_af(info) != AF_INET) + return -EINVAL; + to = (struct bpf_tunnel_key *)compat; + break; + default: + return -EINVAL; + } + } to->tunnel_id = be64_to_cpu(info->key.tun_id); - to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src); + to->tunnel_tos = info->key.tos; + to->tunnel_ttl = info->key.ttl; + + if (flags & BPF_F_TUNINFO_IPV6) + memcpy(to->remote_ipv6, &info->key.u.ipv6.src, + sizeof(to->remote_ipv6)); + else + to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src); + + if (unlikely(size != sizeof(struct bpf_tunnel_key))) + memcpy((void *)(long) r2, to, size); return 0; } @@ -1714,10 +1744,25 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) struct sk_buff *skb = (struct sk_buff *) (long) r1; struct bpf_tunnel_key *from = (struct bpf_tunnel_key *) (long) r2; struct metadata_dst *md = this_cpu_ptr(md_dst); + u8 compat[sizeof(struct bpf_tunnel_key)]; struct ip_tunnel_info *info; - if (unlikely(size != sizeof(struct bpf_tunnel_key) || flags)) + if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6))) return -EINVAL; + if (unlikely(size != sizeof(struct bpf_tunnel_key))) { + switch (size) { + case offsetof(struct bpf_tunnel_key, remote_ipv6[1]): + /* Fixup deprecated structure layouts here, so we have + * a common path later on. 
+ */ + memcpy(compat, from, size); + memset(compat + size, 0, sizeof(compat) - size); + from = (struct bpf_tunnel_key *)compat; + break; + default: + return -EINVAL; + } + } skb_dst_drop(skb); dst_hold((struct dst_entry *) md); @@ -1725,9 +1770,19 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) info = &md->u.tun_info; info->mode = IP_TUNNEL_INFO_TX; + info->key.tun_flags = TUNNEL_KEY; info->key.tun_id = cpu_to_be64(from->tunnel_id); - info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4); + info->key.tos = from->tunnel_tos; + info->key.ttl = from->tunnel_ttl; + + if (flags & BPF_F_TUNINFO_IPV6) { + info->mode |= IP_TUNNEL_INFO_IPV6; + memcpy(&info->key.u.ipv6.dst, from->remote_ipv6, + sizeof(from->remote_ipv6)); + } else { + info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4); + } return 0; } -- cgit v1.2.3
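As a closing illustration of the new flag from an eBPF program's point of view, a hedged sketch of a tc classifier (the helper prototype is declared in the style of the kernel samples; the tunnel id and a collect-md vxlan device are assumptions):

#include <linux/bpf.h>
#include <linux/pkt_cls.h>

static int (*bpf_skb_get_tunnel_key)(void *ctx, struct bpf_tunnel_key *key,
				     int size, int flags) =
	(void *) BPF_FUNC_skb_get_tunnel_key;

__attribute__((section("ingress"), used))
int tunnel_check(struct __sk_buff *skb)
{
	struct bpf_tunnel_key key = {};

	/* Request the IPv6 view of the metadata dst; per the patch above,
	 * an address-family mismatch now yields -EPROTO. */
	if (bpf_skb_get_tunnel_key(skb, &key, sizeof(key), BPF_F_TUNINFO_IPV6))
		return TC_ACT_SHOT;

	return key.tunnel_id == 42 ? TC_ACT_OK : TC_ACT_SHOT;
}

char __license[] __attribute__((section("license"), used)) = "GPL";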