summaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
authorArnaldo Carvalho de Melo <acme@redhat.com>2009-10-12 23:40:10 -0700
committerDavid S. Miller <davem@davemloft.net>2009-10-12 23:40:10 -0700
commita2e2725541fad72416326798c2d7fa4dafb7d337 (patch)
tree6174be11da607e83eb8efb3775114ad4d6e0ca3a /net
parentc05e85a06e376f6b6d59e71e5333d707e956d78b (diff)
downloadblackbird-op-linux-a2e2725541fad72416326798c2d7fa4dafb7d337.tar.gz
blackbird-op-linux-a2e2725541fad72416326798c2d7fa4dafb7d337.zip
net: Introduce recvmmsg socket syscall
Meaning receive multiple messages, reducing the number of syscalls and net stack entry/exit operations. Next patches will introduce mechanisms where protocols that want to optimize this operation will provide an unlocked_recvmsg operation. This takes into account comments made by: . Paul Moore: sock_recvmsg is called only for the first datagram, sock_recvmsg_nosec is used for the rest. . Caitlin Bestler: recvmmsg now has a struct timespec timeout, that works in the same fashion as the ppoll one. If the underlying protocol returns a datagram with MSG_OOB set, this will make recvmmsg return right away with as many datagrams (+ the OOB one) it has received so far. . Rémi Denis-Courmont & Steven Whitehouse: If we receive N < vlen datagrams and then recvmsg returns an error, recvmmsg will return the successfully received datagrams, store the error and return it in the next call. This paves the way for a subsequent optimization, sk_prot->unlocked_recvmsg, where we will be able to acquire the lock only at batch start and end, not at every underlying recvmsg call. Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net')
-rw-r--r--net/compat.c33
-rw-r--r--net/socket.c225
2 files changed, 213 insertions, 45 deletions
diff --git a/net/compat.c b/net/compat.c
index a407c3addbae..e13f5256fd20 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -727,10 +727,10 @@ EXPORT_SYMBOL(compat_mc_getsockopt);
/* Argument list sizes for compat_sys_socketcall */
#define AL(x) ((x) * sizeof(u32))
-static unsigned char nas[19]={AL(0),AL(3),AL(3),AL(3),AL(2),AL(3),
+static unsigned char nas[20]={AL(0),AL(3),AL(3),AL(3),AL(2),AL(3),
AL(3),AL(3),AL(4),AL(4),AL(4),AL(6),
AL(6),AL(2),AL(5),AL(5),AL(3),AL(3),
- AL(4)};
+ AL(4),AL(5)};
#undef AL
asmlinkage long compat_sys_sendmsg(int fd, struct compat_msghdr __user *msg, unsigned flags)
@@ -755,13 +755,36 @@ asmlinkage long compat_sys_recvfrom(int fd, void __user *buf, size_t len,
return sys_recvfrom(fd, buf, len, flags | MSG_CMSG_COMPAT, addr, addrlen);
}
+asmlinkage long compat_sys_recvmmsg(int fd, struct compat_mmsghdr __user *mmsg,
+ unsigned vlen, unsigned int flags,
+ struct timespec __user *timeout)
+{
+ int datagrams;
+ struct timespec ktspec;
+ struct compat_timespec __user *utspec =
+ (struct compat_timespec __user *)timeout;
+
+ if (get_user(ktspec.tv_sec, &utspec->tv_sec) ||
+ get_user(ktspec.tv_nsec, &utspec->tv_nsec))
+ return -EFAULT;
+
+ datagrams = __sys_recvmmsg(fd, (struct mmsghdr __user *)mmsg, vlen,
+ flags | MSG_CMSG_COMPAT, &ktspec);
+ if (datagrams > 0 &&
+ (put_user(ktspec.tv_sec, &utspec->tv_sec) ||
+ put_user(ktspec.tv_nsec, &utspec->tv_nsec)))
+ datagrams = -EFAULT;
+
+ return datagrams;
+}
+
asmlinkage long compat_sys_socketcall(int call, u32 __user *args)
{
int ret;
u32 a[6];
u32 a0, a1;
- if (call < SYS_SOCKET || call > SYS_ACCEPT4)
+ if (call < SYS_SOCKET || call > SYS_RECVMMSG)
return -EINVAL;
if (copy_from_user(a, args, nas[call]))
return -EFAULT;
@@ -823,6 +846,10 @@ asmlinkage long compat_sys_socketcall(int call, u32 __user *args)
case SYS_RECVMSG:
ret = compat_sys_recvmsg(a0, compat_ptr(a1), a[2]);
break;
+ case SYS_RECVMMSG:
+ ret = compat_sys_recvmmsg(a0, compat_ptr(a1), a[2], a[3],
+ compat_ptr(a[4]));
+ break;
case SYS_ACCEPT4:
ret = sys_accept4(a0, compat_ptr(a1), compat_ptr(a[2]), a[3]);
break;
diff --git a/net/socket.c b/net/socket.c
index 807935693846..9dff31c9b799 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -683,10 +683,9 @@ void sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk,
}
EXPORT_SYMBOL_GPL(sock_recv_ts_and_drops);
-static inline int __sock_recvmsg(struct kiocb *iocb, struct socket *sock,
- struct msghdr *msg, size_t size, int flags)
+static inline int __sock_recvmsg_nosec(struct kiocb *iocb, struct socket *sock,
+ struct msghdr *msg, size_t size, int flags)
{
- int err;
struct sock_iocb *si = kiocb_to_siocb(iocb);
si->sock = sock;
@@ -695,13 +694,17 @@ static inline int __sock_recvmsg(struct kiocb *iocb, struct socket *sock,
si->size = size;
si->flags = flags;
- err = security_socket_recvmsg(sock, msg, size, flags);
- if (err)
- return err;
-
return sock->ops->recvmsg(iocb, sock, msg, size, flags);
}
+static inline int __sock_recvmsg(struct kiocb *iocb, struct socket *sock,
+ struct msghdr *msg, size_t size, int flags)
+{
+ int err = security_socket_recvmsg(sock, msg, size, flags);
+
+ return err ?: __sock_recvmsg_nosec(iocb, sock, msg, size, flags);
+}
+
int sock_recvmsg(struct socket *sock, struct msghdr *msg,
size_t size, int flags)
{
@@ -717,6 +720,21 @@ int sock_recvmsg(struct socket *sock, struct msghdr *msg,
return ret;
}
+static int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg,
+ size_t size, int flags)
+{
+ struct kiocb iocb;
+ struct sock_iocb siocb;
+ int ret;
+
+ init_sync_kiocb(&iocb, NULL);
+ iocb.private = &siocb;
+ ret = __sock_recvmsg_nosec(&iocb, sock, msg, size, flags);
+ if (-EIOCBQUEUED == ret)
+ ret = wait_on_sync_kiocb(&iocb);
+ return ret;
+}
+
int kernel_recvmsg(struct socket *sock, struct msghdr *msg,
struct kvec *vec, size_t num, size_t size, int flags)
{
@@ -1983,22 +2001,15 @@ out:
return err;
}
-/*
- * BSD recvmsg interface
- */
-
-SYSCALL_DEFINE3(recvmsg, int, fd, struct msghdr __user *, msg,
- unsigned int, flags)
+static int __sys_recvmsg(struct socket *sock, struct msghdr __user *msg,
+ struct msghdr *msg_sys, unsigned flags, int nosec)
{
struct compat_msghdr __user *msg_compat =
(struct compat_msghdr __user *)msg;
- struct socket *sock;
struct iovec iovstack[UIO_FASTIOV];
struct iovec *iov = iovstack;
- struct msghdr msg_sys;
unsigned long cmsg_ptr;
int err, iov_size, total_len, len;
- int fput_needed;
/* kernel mode address */
struct sockaddr_storage addr;
@@ -2008,27 +2019,23 @@ SYSCALL_DEFINE3(recvmsg, int, fd, struct msghdr __user *, msg,
int __user *uaddr_len;
if (MSG_CMSG_COMPAT & flags) {
- if (get_compat_msghdr(&msg_sys, msg_compat))
+ if (get_compat_msghdr(msg_sys, msg_compat))
return -EFAULT;
}
- else if (copy_from_user(&msg_sys, msg, sizeof(struct msghdr)))
+ else if (copy_from_user(msg_sys, msg, sizeof(struct msghdr)))
return -EFAULT;
- sock = sockfd_lookup_light(fd, &err, &fput_needed);
- if (!sock)
- goto out;
-
err = -EMSGSIZE;
- if (msg_sys.msg_iovlen > UIO_MAXIOV)
- goto out_put;
+ if (msg_sys->msg_iovlen > UIO_MAXIOV)
+ goto out;
/* Check whether to allocate the iovec area */
err = -ENOMEM;
- iov_size = msg_sys.msg_iovlen * sizeof(struct iovec);
- if (msg_sys.msg_iovlen > UIO_FASTIOV) {
+ iov_size = msg_sys->msg_iovlen * sizeof(struct iovec);
+ if (msg_sys->msg_iovlen > UIO_FASTIOV) {
iov = sock_kmalloc(sock->sk, iov_size, GFP_KERNEL);
if (!iov)
- goto out_put;
+ goto out;
}
/*
@@ -2036,46 +2043,47 @@ SYSCALL_DEFINE3(recvmsg, int, fd, struct msghdr __user *, msg,
* kernel msghdr to use the kernel address space)
*/
- uaddr = (__force void __user *)msg_sys.msg_name;
+ uaddr = (__force void __user *)msg_sys->msg_name;
uaddr_len = COMPAT_NAMELEN(msg);
if (MSG_CMSG_COMPAT & flags) {
- err = verify_compat_iovec(&msg_sys, iov,
+ err = verify_compat_iovec(msg_sys, iov,
(struct sockaddr *)&addr,
VERIFY_WRITE);
} else
- err = verify_iovec(&msg_sys, iov,
+ err = verify_iovec(msg_sys, iov,
(struct sockaddr *)&addr,
VERIFY_WRITE);
if (err < 0)
goto out_freeiov;
total_len = err;
- cmsg_ptr = (unsigned long)msg_sys.msg_control;
- msg_sys.msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT);
+ cmsg_ptr = (unsigned long)msg_sys->msg_control;
+ msg_sys->msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT);
if (sock->file->f_flags & O_NONBLOCK)
flags |= MSG_DONTWAIT;
- err = sock_recvmsg(sock, &msg_sys, total_len, flags);
+ err = (nosec ? sock_recvmsg_nosec : sock_recvmsg)(sock, msg_sys,
+ total_len, flags);
if (err < 0)
goto out_freeiov;
len = err;
if (uaddr != NULL) {
err = move_addr_to_user((struct sockaddr *)&addr,
- msg_sys.msg_namelen, uaddr,
+ msg_sys->msg_namelen, uaddr,
uaddr_len);
if (err < 0)
goto out_freeiov;
}
- err = __put_user((msg_sys.msg_flags & ~MSG_CMSG_COMPAT),
+ err = __put_user((msg_sys->msg_flags & ~MSG_CMSG_COMPAT),
COMPAT_FLAGS(msg));
if (err)
goto out_freeiov;
if (MSG_CMSG_COMPAT & flags)
- err = __put_user((unsigned long)msg_sys.msg_control - cmsg_ptr,
+ err = __put_user((unsigned long)msg_sys->msg_control - cmsg_ptr,
&msg_compat->msg_controllen);
else
- err = __put_user((unsigned long)msg_sys.msg_control - cmsg_ptr,
+ err = __put_user((unsigned long)msg_sys->msg_control - cmsg_ptr,
&msg->msg_controllen);
if (err)
goto out_freeiov;
@@ -2084,21 +2092,150 @@ SYSCALL_DEFINE3(recvmsg, int, fd, struct msghdr __user *, msg,
out_freeiov:
if (iov != iovstack)
sock_kfree_s(sock->sk, iov, iov_size);
-out_put:
+out:
+ return err;
+}
+
+/*
+ * BSD recvmsg interface
+ */
+
+SYSCALL_DEFINE3(recvmsg, int, fd, struct msghdr __user *, msg,
+ unsigned int, flags)
+{
+ int fput_needed, err;
+ struct msghdr msg_sys;
+ struct socket *sock = sockfd_lookup_light(fd, &err, &fput_needed);
+
+ if (!sock)
+ goto out;
+
+ err = __sys_recvmsg(sock, msg, &msg_sys, flags, 0);
+
fput_light(sock->file, fput_needed);
out:
return err;
}
-#ifdef __ARCH_WANT_SYS_SOCKETCALL
+/*
+ * Linux recvmmsg interface
+ */
+
+int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
+ unsigned int flags, struct timespec *timeout)
+{
+ int fput_needed, err, datagrams;
+ struct socket *sock;
+ struct mmsghdr __user *entry;
+ struct msghdr msg_sys;
+ struct timespec end_time;
+
+ if (timeout &&
+ poll_select_set_timeout(&end_time, timeout->tv_sec,
+ timeout->tv_nsec))
+ return -EINVAL;
+
+ datagrams = 0;
+
+ sock = sockfd_lookup_light(fd, &err, &fput_needed);
+ if (!sock)
+ return err;
+
+ err = sock_error(sock->sk);
+ if (err)
+ goto out_put;
+
+ entry = mmsg;
+
+ while (datagrams < vlen) {
+ /*
+ * No need to ask LSM for more than the first datagram.
+ */
+ err = __sys_recvmsg(sock, (struct msghdr __user *)entry,
+ &msg_sys, flags, datagrams);
+ if (err < 0)
+ break;
+ err = put_user(err, &entry->msg_len);
+ if (err)
+ break;
+ ++entry;
+ ++datagrams;
+
+ if (timeout) {
+ ktime_get_ts(timeout);
+ *timeout = timespec_sub(end_time, *timeout);
+ if (timeout->tv_sec < 0) {
+ timeout->tv_sec = timeout->tv_nsec = 0;
+ break;
+ }
+
+ /* Timeout, return less than vlen datagrams */
+ if (timeout->tv_nsec == 0 && timeout->tv_sec == 0)
+ break;
+ }
+
+ /* Out of band data, return right away */
+ if (msg_sys.msg_flags & MSG_OOB)
+ break;
+ }
+
+out_put:
+ fput_light(sock->file, fput_needed);
+ if (err == 0)
+ return datagrams;
+
+ if (datagrams != 0) {
+ /*
+ * We may return less entries than requested (vlen) if the
+ * sock is non block and there aren't enough datagrams...
+ */
+ if (err != -EAGAIN) {
+ /*
+ * ... or if recvmsg returns an error after we
+ * received some datagrams, where we record the
+ * error to return on the next call or if the
+ * app asks about it using getsockopt(SO_ERROR).
+ */
+ sock->sk->sk_err = -err;
+ }
+
+ return datagrams;
+ }
+
+ return err;
+}
+
+SYSCALL_DEFINE5(recvmmsg, int, fd, struct mmsghdr __user *, mmsg,
+ unsigned int, vlen, unsigned int, flags,
+ struct timespec __user *, timeout)
+{
+ int datagrams;
+ struct timespec timeout_sys;
+
+ if (!timeout)
+ return __sys_recvmmsg(fd, mmsg, vlen, flags, NULL);
+
+ if (copy_from_user(&timeout_sys, timeout, sizeof(timeout_sys)))
+ return -EFAULT;
+
+ datagrams = __sys_recvmmsg(fd, mmsg, vlen, flags, &timeout_sys);
+
+ if (datagrams > 0 &&
+ copy_to_user(timeout, &timeout_sys, sizeof(timeout_sys)))
+ datagrams = -EFAULT;
+
+ return datagrams;
+}
+
+#ifdef __ARCH_WANT_SYS_SOCKETCALL
/* Argument list sizes for sys_socketcall */
#define AL(x) ((x) * sizeof(unsigned long))
-static const unsigned char nargs[19]={
+static const unsigned char nargs[20] = {
AL(0),AL(3),AL(3),AL(3),AL(2),AL(3),
AL(3),AL(3),AL(4),AL(4),AL(4),AL(6),
AL(6),AL(2),AL(5),AL(5),AL(3),AL(3),
- AL(4)
+ AL(4),AL(5)
};
#undef AL
@@ -2118,7 +2255,7 @@ SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args)
int err;
unsigned int len;
- if (call < 1 || call > SYS_ACCEPT4)
+ if (call < 1 || call > SYS_RECVMMSG)
return -EINVAL;
len = nargs[call];
@@ -2196,6 +2333,10 @@ SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args)
case SYS_RECVMSG:
err = sys_recvmsg(a0, (struct msghdr __user *)a1, a[2]);
break;
+ case SYS_RECVMMSG:
+ err = sys_recvmmsg(a0, (struct mmsghdr __user *)a1, a[2], a[3],
+ (struct timespec __user *)a[4]);
+ break;
case SYS_ACCEPT4:
err = sys_accept4(a0, (struct sockaddr __user *)a1,
(int __user *)a[2], a[3]);
OpenPOWER on IntegriCloud