From 581319c58600b54612c417aff32ae9bbd79f4cdb Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 9 Mar 2017 13:54:08 +0100 Subject: net/socket: use per af lockdep classes for sk queues Currently the sock queue's spin locks get their lockdep classes by the default init_spin_lock() initializer: all socket families get - usually, see below - a single class for rx, another specific class for tx, etc. This can lead to false positive lockdep splat, as reported by Andrey. Moreover there are two separate initialization points for the sock queues, one in sk_clone_lock() and one in sock_init_data(), so that e.g. the rx queue lock can get one of two possible, different classes, depending on the socket being cloned or not. This change tries to address the above, setting explicitly a per address family lockdep class for each queue's spinlock. Also, move the duplicated initialization code to a single location. v1 -> v2: - renamed the init helper rfc -> v1: - no changes, tested with several different workload Suggested-by: Cong Wang Signed-off-by: Paolo Abeni Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/sock.c | 96 ++++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 78 insertions(+), 18 deletions(-) (limited to 'net/core/sock.c') diff --git a/net/core/sock.c b/net/core/sock.c index f6fd79f33097..768aedf238f5 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -258,12 +258,66 @@ static const char *const af_family_clock_key_strings[AF_MAX+1] = { "clock-AF_NFC" , "clock-AF_VSOCK" , "clock-AF_KCM" , "clock-AF_QIPCRTR", "clock-AF_SMC" , "clock-AF_MAX" }; +static const char *const af_family_rlock_key_strings[AF_MAX+1] = { + "rlock-AF_UNSPEC", "rlock-AF_UNIX" , "rlock-AF_INET" , + "rlock-AF_AX25" , "rlock-AF_IPX" , "rlock-AF_APPLETALK", + "rlock-AF_NETROM", "rlock-AF_BRIDGE" , "rlock-AF_ATMPVC" , + "rlock-AF_X25" , "rlock-AF_INET6" , "rlock-AF_ROSE" , + "rlock-AF_DECnet", "rlock-AF_NETBEUI" , "rlock-AF_SECURITY" , + "rlock-AF_KEY" , "rlock-AF_NETLINK" , "rlock-AF_PACKET" , + "rlock-AF_ASH" , "rlock-AF_ECONET" , "rlock-AF_ATMSVC" , + "rlock-AF_RDS" , "rlock-AF_SNA" , "rlock-AF_IRDA" , + "rlock-AF_PPPOX" , "rlock-AF_WANPIPE" , "rlock-AF_LLC" , + "rlock-27" , "rlock-28" , "rlock-AF_CAN" , + "rlock-AF_TIPC" , "rlock-AF_BLUETOOTH", "rlock-AF_IUCV" , + "rlock-AF_RXRPC" , "rlock-AF_ISDN" , "rlock-AF_PHONET" , + "rlock-AF_IEEE802154", "rlock-AF_CAIF" , "rlock-AF_ALG" , + "rlock-AF_NFC" , "rlock-AF_VSOCK" , "rlock-AF_KCM" , + "rlock-AF_QIPCRTR", "rlock-AF_SMC" , "rlock-AF_MAX" +}; +static const char *const af_family_wlock_key_strings[AF_MAX+1] = { + "wlock-AF_UNSPEC", "wlock-AF_UNIX" , "wlock-AF_INET" , + "wlock-AF_AX25" , "wlock-AF_IPX" , "wlock-AF_APPLETALK", + "wlock-AF_NETROM", "wlock-AF_BRIDGE" , "wlock-AF_ATMPVC" , + "wlock-AF_X25" , "wlock-AF_INET6" , "wlock-AF_ROSE" , + "wlock-AF_DECnet", "wlock-AF_NETBEUI" , "wlock-AF_SECURITY" , + "wlock-AF_KEY" , "wlock-AF_NETLINK" , "wlock-AF_PACKET" , + "wlock-AF_ASH" , "wlock-AF_ECONET" , "wlock-AF_ATMSVC" , + "wlock-AF_RDS" , "wlock-AF_SNA" , "wlock-AF_IRDA" , + "wlock-AF_PPPOX" , "wlock-AF_WANPIPE" , "wlock-AF_LLC" , + "wlock-27" , "wlock-28" , "wlock-AF_CAN" , + "wlock-AF_TIPC" , "wlock-AF_BLUETOOTH", "wlock-AF_IUCV" , + "wlock-AF_RXRPC" , "wlock-AF_ISDN" , "wlock-AF_PHONET" , + "wlock-AF_IEEE802154", "wlock-AF_CAIF" , "wlock-AF_ALG" , + "wlock-AF_NFC" , "wlock-AF_VSOCK" , "wlock-AF_KCM" , + "wlock-AF_QIPCRTR", "wlock-AF_SMC" , "wlock-AF_MAX" +}; +static const char *const af_family_elock_key_strings[AF_MAX+1] = { + "elock-AF_UNSPEC", "elock-AF_UNIX" , "elock-AF_INET" , + "elock-AF_AX25" , "elock-AF_IPX" , "elock-AF_APPLETALK", + "elock-AF_NETROM", "elock-AF_BRIDGE" , "elock-AF_ATMPVC" , + "elock-AF_X25" , "elock-AF_INET6" , "elock-AF_ROSE" , + "elock-AF_DECnet", "elock-AF_NETBEUI" , "elock-AF_SECURITY" , + "elock-AF_KEY" , "elock-AF_NETLINK" , "elock-AF_PACKET" , + "elock-AF_ASH" , "elock-AF_ECONET" , "elock-AF_ATMSVC" , + "elock-AF_RDS" , "elock-AF_SNA" , "elock-AF_IRDA" , + "elock-AF_PPPOX" , "elock-AF_WANPIPE" , "elock-AF_LLC" , + "elock-27" , "elock-28" , "elock-AF_CAN" , + "elock-AF_TIPC" , "elock-AF_BLUETOOTH", "elock-AF_IUCV" , + "elock-AF_RXRPC" , "elock-AF_ISDN" , "elock-AF_PHONET" , + "elock-AF_IEEE802154", "elock-AF_CAIF" , "elock-AF_ALG" , + "elock-AF_NFC" , "elock-AF_VSOCK" , "elock-AF_KCM" , + "elock-AF_QIPCRTR", "elock-AF_SMC" , "elock-AF_MAX" +}; /* - * sk_callback_lock locking rules are per-address-family, + * sk_callback_lock and sk queues locking rules are per-address-family, * so split the lock classes by using a per-AF key: */ static struct lock_class_key af_callback_keys[AF_MAX]; +static struct lock_class_key af_rlock_keys[AF_MAX]; +static struct lock_class_key af_wlock_keys[AF_MAX]; +static struct lock_class_key af_elock_keys[AF_MAX]; /* Take into consideration the size of the struct sk_buff overhead in the * determination of these values, since that is non-constant across @@ -1478,6 +1532,27 @@ void sk_free(struct sock *sk) } EXPORT_SYMBOL(sk_free); +static void sk_init_common(struct sock *sk) +{ + skb_queue_head_init(&sk->sk_receive_queue); + skb_queue_head_init(&sk->sk_write_queue); + skb_queue_head_init(&sk->sk_error_queue); + + rwlock_init(&sk->sk_callback_lock); + lockdep_set_class_and_name(&sk->sk_receive_queue.lock, + af_rlock_keys + sk->sk_family, + af_family_rlock_key_strings[sk->sk_family]); + lockdep_set_class_and_name(&sk->sk_write_queue.lock, + af_wlock_keys + sk->sk_family, + af_family_wlock_key_strings[sk->sk_family]); + lockdep_set_class_and_name(&sk->sk_error_queue.lock, + af_elock_keys + sk->sk_family, + af_family_elock_key_strings[sk->sk_family]); + lockdep_set_class_and_name(&sk->sk_callback_lock, + af_callback_keys + sk->sk_family, + af_family_clock_key_strings[sk->sk_family]); +} + /** * sk_clone_lock - clone a socket, and lock its clone * @sk: the socket to clone @@ -1511,13 +1586,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) */ atomic_set(&newsk->sk_wmem_alloc, 1); atomic_set(&newsk->sk_omem_alloc, 0); - skb_queue_head_init(&newsk->sk_receive_queue); - skb_queue_head_init(&newsk->sk_write_queue); - - rwlock_init(&newsk->sk_callback_lock); - lockdep_set_class_and_name(&newsk->sk_callback_lock, - af_callback_keys + newsk->sk_family, - af_family_clock_key_strings[newsk->sk_family]); + sk_init_common(newsk); newsk->sk_dst_cache = NULL; newsk->sk_dst_pending_confirm = 0; @@ -1528,7 +1597,6 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; sock_reset_flag(newsk, SOCK_DONE); - skb_queue_head_init(&newsk->sk_error_queue); filter = rcu_dereference_protected(newsk->sk_filter, 1); if (filter != NULL) @@ -2454,10 +2522,7 @@ EXPORT_SYMBOL(sk_stop_timer); void sock_init_data(struct socket *sock, struct sock *sk) { - skb_queue_head_init(&sk->sk_receive_queue); - skb_queue_head_init(&sk->sk_write_queue); - skb_queue_head_init(&sk->sk_error_queue); - + sk_init_common(sk); sk->sk_send_head = NULL; init_timer(&sk->sk_timer); @@ -2480,11 +2545,6 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_uid = make_kuid(sock_net(sk)->user_ns, 0); } - rwlock_init(&sk->sk_callback_lock); - lockdep_set_class_and_name(&sk->sk_callback_lock, - af_callback_keys + sk->sk_family, - af_family_clock_key_strings[sk->sk_family]); - sk->sk_state_change = sock_def_wakeup; sk->sk_data_ready = sock_def_readable; sk->sk_write_space = sock_def_write_space; -- cgit v1.2.1 From a2d133b1d465016d0d97560b11f54ba0ace56d3e Mon Sep 17 00:00:00 2001 From: Josh Hunt Date: Mon, 20 Mar 2017 15:22:03 -0400 Subject: sock: introduce SO_MEMINFO getsockopt Allows reading of SK_MEMINFO_VARS via socket option. This way an application can get all meminfo related information in single socket option call instead of multiple calls. Adds helper function, sk_get_meminfo(), and uses that for both getsockopt and sock_diag_put_meminfo(). Suggested by Eric Dumazet. Signed-off-by: Josh Hunt Reviewed-by: Jason Baron Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/sock.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'net/core/sock.c') diff --git a/net/core/sock.c b/net/core/sock.c index a83731c36761..f8c0373a3a74 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1313,6 +1313,21 @@ int sock_getsockopt(struct socket *sock, int level, int optname, v.val = sk->sk_incoming_cpu; break; + case SO_MEMINFO: + { + u32 meminfo[SK_MEMINFO_VARS]; + + if (get_user(len, optlen)) + return -EFAULT; + + sk_get_meminfo(sk, meminfo); + + len = min_t(unsigned int, len, sizeof(meminfo)); + if (copy_to_user(optval, &meminfo, len)) + return -EFAULT; + + goto lenout; + } default: /* We implement the SO_SNDLOWAT etc to not be settable * (1003.1g 7). @@ -2861,6 +2876,21 @@ void sk_common_release(struct sock *sk) } EXPORT_SYMBOL(sk_common_release); +void sk_get_meminfo(const struct sock *sk, u32 *mem) +{ + memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS); + + mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk); + mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf; + mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk); + mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf; + mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc; + mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued; + mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); + mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len; + mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops); +} + #ifdef CONFIG_PROC_FS #define PROTO_INUSE_NR 64 /* should be enough for the first time */ struct prot_inuse { -- cgit v1.2.1 From 7db6b048da3b9f84fe1d22fb29ff7e7c2ec6c0e5 Mon Sep 17 00:00:00 2001 From: Sridhar Samudrala Date: Fri, 24 Mar 2017 10:08:24 -0700 Subject: net: Commonize busy polling code to focus on napi_id instead of socket Move the core functionality in sk_busy_loop() to napi_busy_loop() and make it independent of sk. This enables re-using this function in epoll busy loop implementation. Signed-off-by: Sridhar Samudrala Signed-off-by: Alexander Duyck Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/sock.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'net/core/sock.c') diff --git a/net/core/sock.c b/net/core/sock.c index 1b9030ee6f4b..4b762f2a3552 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3237,3 +3237,14 @@ static int __init proto_init(void) subsys_initcall(proto_init); #endif /* PROC_FS */ + +#ifdef CONFIG_NET_RX_BUSY_POLL +bool sk_busy_loop_end(void *p, unsigned long start_time) +{ + struct sock *sk = p; + + return !skb_queue_empty(&sk->sk_receive_queue) || + sk_busy_loop_timeout(sk, start_time); +} +EXPORT_SYMBOL(sk_busy_loop_end); +#endif /* CONFIG_NET_RX_BUSY_POLL */ -- cgit v1.2.1 From 6d4339028b350efbf87c61e6d9e113e5373545c9 Mon Sep 17 00:00:00 2001 From: Sridhar Samudrala Date: Fri, 24 Mar 2017 10:08:36 -0700 Subject: net: Introduce SO_INCOMING_NAPI_ID This socket option returns the NAPI ID associated with the queue on which the last frame is received. This information can be used by the apps to split the incoming flows among the threads based on the Rx queue on which they are received. If the NAPI ID actually represents a sender_cpu then the value is ignored and 0 is returned. Signed-off-by: Sridhar Samudrala Signed-off-by: Alexander Duyck Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/sock.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'net/core/sock.c') diff --git a/net/core/sock.c b/net/core/sock.c index 4b762f2a3552..1a58a9dc6888 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1328,6 +1328,18 @@ int sock_getsockopt(struct socket *sock, int level, int optname, goto lenout; } + +#ifdef CONFIG_NET_RX_BUSY_POLL + case SO_INCOMING_NAPI_ID: + v.val = READ_ONCE(sk->sk_napi_id); + + /* aggregate non-NAPI IDs down to 0 */ + if (v.val < MIN_NAPI_ID) + v.val = 0; + + break; +#endif + default: /* We implement the SO_SNDLOWAT etc to not be settable * (1003.1g 7). -- cgit v1.2.1 From 6c7c98bad4883a4a8710c96b2b44de482865eb6e Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 30 Mar 2017 14:03:06 +0200 Subject: sock: avoid dirtying sk_stamp, if possible sock_recv_ts_and_drops() unconditionally set sk->sk_stamp for every packet, even if the SOCK_TIMESTAMP flag is not set in the related socket. If selinux is enabled, this cause a cache miss for every packet since sk->sk_stamp and sk->sk_security share the same cacheline. With this change sk_stamp is set only if the SOCK_TIMESTAMP flag is set, and is cleared for the first packet, so that the user perceived behavior is unchanged. This gives up to 5% speed-up under udp-flood with small packets. Signed-off-by: Paolo Abeni Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/sock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core/sock.c') diff --git a/net/core/sock.c b/net/core/sock.c index 1a58a9dc6888..392f9b6f96e2 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2613,7 +2613,7 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; - sk->sk_stamp = ktime_set(-1L, 0); + sk->sk_stamp = SK_DEFAULT_STAMP; #ifdef CONFIG_NET_RX_BUSY_POLL sk->sk_napi_id = 0; -- cgit v1.2.1 From 5daab9db7b65df87da26fd8cfa695fb9546a1ddb Mon Sep 17 00:00:00 2001 From: Chenbo Feng Date: Wed, 5 Apr 2017 19:00:55 -0700 Subject: New getsockopt option to get socket cookie Introduce a new getsockopt operation to retrieve the socket cookie for a specific socket based on the socket fd. It returns a unique non-decreasing cookie for each socket. Tested: https://android-review.googlesource.com/#/c/358163/ Acked-by: Willem de Bruijn Signed-off-by: Chenbo Feng Signed-off-by: David S. Miller --- net/core/sock.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'net/core/sock.c') diff --git a/net/core/sock.c b/net/core/sock.c index 392f9b6f96e2..a06bb7a2a689 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1083,6 +1083,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname, union { int val; + u64 val64; struct linger ling; struct timeval tm; } v; @@ -1340,6 +1341,13 @@ int sock_getsockopt(struct socket *sock, int level, int optname, break; #endif + case SO_COOKIE: + lv = sizeof(u64); + if (len < lv) + return -EINVAL; + v.val64 = sock_gen_cookie(sk); + break; + default: /* We implement the SO_SNDLOWAT etc to not be settable * (1003.1g 7). -- cgit v1.2.1