diff options
Diffstat (limited to 'ipc')
| -rw-r--r-- | ipc/mqueue.c | 130 | ||||
| -rw-r--r-- | ipc/msg.c | 62 | ||||
| -rw-r--r-- | ipc/sem.c | 69 | ||||
| -rw-r--r-- | ipc/syscall.c | 2 | ||||
| -rw-r--r-- | ipc/util.c | 16 | ||||
| -rw-r--r-- | ipc/util.h | 25 |
6 files changed, 197 insertions, 107 deletions
diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 7a5a8edc3de3..49a05ba3000d 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -63,6 +63,66 @@ struct posix_msg_tree_node { int priority; }; +/* + * Locking: + * + * Accesses to a message queue are synchronized by acquiring info->lock. + * + * There are two notable exceptions: + * - The actual wakeup of a sleeping task is performed using the wake_q + * framework. info->lock is already released when wake_up_q is called. + * - The exit codepaths after sleeping check ext_wait_queue->state without + * any locks. If it is STATE_READY, then the syscall is completed without + * acquiring info->lock. + * + * MQ_BARRIER: + * To achieve proper release/acquire memory barrier pairing, the state is set to + * STATE_READY with smp_store_release(), and it is read with READ_ONCE followed + * by smp_acquire__after_ctrl_dep(). In addition, wake_q_add_safe() is used. + * + * This prevents the following races: + * + * 1) With the simple wake_q_add(), the task could be gone already before + * the increase of the reference happens + * Thread A + * Thread B + * WRITE_ONCE(wait.state, STATE_NONE); + * schedule_hrtimeout() + * wake_q_add(A) + * if (cmpxchg()) // success + * ->state = STATE_READY (reordered) + * <timeout returns> + * if (wait.state == STATE_READY) return; + * sysret to user space + * sys_exit() + * get_task_struct() // UaF + * + * Solution: Use wake_q_add_safe() and perform the get_task_struct() before + * the smp_store_release() that does ->state = STATE_READY. + * + * 2) Without proper _release/_acquire barriers, the woken up task + * could read stale data + * + * Thread A + * Thread B + * do_mq_timedreceive + * WRITE_ONCE(wait.state, STATE_NONE); + * schedule_hrtimeout() + * state = STATE_READY; + * <timeout returns> + * if (wait.state == STATE_READY) return; + * msg_ptr = wait.msg; // Access to stale data! + * receiver->msg = message; (reordered) + * + * Solution: use _release and _acquire barriers. + * + * 3) There is intentionally no barrier when setting current->state + * to TASK_INTERRUPTIBLE: spin_unlock(&info->lock) provides the + * release memory barrier, and the wakeup is triggered when holding + * info->lock, i.e. spin_lock(&info->lock) provided a pairing + * acquire memory barrier. + */ + struct ext_wait_queue { /* queue of sleeping tasks */ struct task_struct *task; struct list_head list; @@ -364,8 +424,7 @@ static int mqueue_get_tree(struct fs_context *fc) { struct mqueue_fs_context *ctx = fc->fs_private; - fc->s_fs_info = ctx->ipc_ns; - return vfs_get_super(fc, vfs_get_keyed_super, mqueue_fill_super); + return get_tree_keyed(fc, mqueue_fill_super, ctx->ipc_ns); } static void mqueue_fs_context_free(struct fs_context *fc) @@ -647,18 +706,23 @@ static int wq_sleep(struct mqueue_inode_info *info, int sr, wq_add(info, sr, ewp); for (;;) { + /* memory barrier not required, we hold info->lock */ __set_current_state(TASK_INTERRUPTIBLE); spin_unlock(&info->lock); time = schedule_hrtimeout_range_clock(timeout, 0, HRTIMER_MODE_ABS, CLOCK_REALTIME); - if (ewp->state == STATE_READY) { + if (READ_ONCE(ewp->state) == STATE_READY) { + /* see MQ_BARRIER for purpose/pairing */ + smp_acquire__after_ctrl_dep(); retval = 0; goto out; } spin_lock(&info->lock); - if (ewp->state == STATE_READY) { + + /* we hold info->lock, so no memory barrier required */ + if (READ_ONCE(ewp->state) == STATE_READY) { retval = 0; goto out_unlock; } @@ -919,6 +983,18 @@ out_name: * The same algorithm is used for senders. */ +static inline void __pipelined_op(struct wake_q_head *wake_q, + struct mqueue_inode_info *info, + struct ext_wait_queue *this) +{ + list_del(&this->list); + get_task_struct(this->task); + + /* see MQ_BARRIER for purpose/pairing */ + smp_store_release(&this->state, STATE_READY); + wake_q_add_safe(wake_q, this->task); +} + /* pipelined_send() - send a message directly to the task waiting in * sys_mq_timedreceive() (without inserting message into a queue). */ @@ -928,17 +1004,7 @@ static inline void pipelined_send(struct wake_q_head *wake_q, struct ext_wait_queue *receiver) { receiver->msg = message; - list_del(&receiver->list); - wake_q_add(wake_q, receiver->task); - /* - * Rely on the implicit cmpxchg barrier from wake_q_add such - * that we can ensure that updating receiver->state is the last - * write operation: As once set, the receiver can continue, - * and if we don't have the reference count from the wake_q, - * yet, at that point we can later have a use-after-free - * condition and bogus wakeup. - */ - receiver->state = STATE_READY; + __pipelined_op(wake_q, info, receiver); } /* pipelined_receive() - if there is task waiting in sys_mq_timedsend() @@ -956,9 +1022,7 @@ static inline void pipelined_receive(struct wake_q_head *wake_q, if (msg_insert(sender->msg, info)) return; - list_del(&sender->list); - wake_q_add(wake_q, sender->task); - sender->state = STATE_READY; + __pipelined_op(wake_q, info, sender); } static int do_mq_timedsend(mqd_t mqdes, const char __user *u_msg_ptr, @@ -1045,7 +1109,9 @@ static int do_mq_timedsend(mqd_t mqdes, const char __user *u_msg_ptr, } else { wait.task = current; wait.msg = (void *) msg_ptr; - wait.state = STATE_NONE; + + /* memory barrier not required, we hold info->lock */ + WRITE_ONCE(wait.state, STATE_NONE); ret = wq_sleep(info, SEND, timeout, &wait); /* * wq_sleep must be called with info->lock held, and @@ -1148,7 +1214,9 @@ static int do_mq_timedreceive(mqd_t mqdes, char __user *u_msg_ptr, ret = -EAGAIN; } else { wait.task = current; - wait.state = STATE_NONE; + + /* memory barrier not required, we hold info->lock */ + WRITE_ONCE(wait.state, STATE_NONE); ret = wq_sleep(info, RECV, timeout, &wait); msg_ptr = wait.msg; } @@ -1241,15 +1309,14 @@ static int do_mq_notify(mqd_t mqdes, const struct sigevent *notification) /* create the notify skb */ nc = alloc_skb(NOTIFY_COOKIE_LEN, GFP_KERNEL); - if (!nc) { - ret = -ENOMEM; - goto out; - } + if (!nc) + return -ENOMEM; + if (copy_from_user(nc->data, notification->sigev_value.sival_ptr, NOTIFY_COOKIE_LEN)) { ret = -EFAULT; - goto out; + goto free_skb; } /* TODO: add a header? */ @@ -1265,8 +1332,7 @@ retry: fdput(f); if (IS_ERR(sock)) { ret = PTR_ERR(sock); - sock = NULL; - goto out; + goto free_skb; } timeo = MAX_SCHEDULE_TIMEOUT; @@ -1275,11 +1341,8 @@ retry: sock = NULL; goto retry; } - if (ret) { - sock = NULL; - nc = NULL; - goto out; - } + if (ret) + return ret; } } @@ -1334,7 +1397,8 @@ out_fput: out: if (sock) netlink_detachskb(sock, nc); - else if (nc) + else +free_skb: dev_kfree_skb(nc); return ret; diff --git a/ipc/msg.c b/ipc/msg.c index 8dec945fa030..caca67368cb5 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -61,6 +61,16 @@ struct msg_queue { struct list_head q_senders; } __randomize_layout; +/* + * MSG_BARRIER Locking: + * + * Similar to the optimization used in ipc/mqueue.c, one syscall return path + * does not acquire any locks when it sees that a message exists in + * msg_receiver.r_msg. Therefore r_msg is set using smp_store_release() + * and accessed using READ_ONCE()+smp_acquire__after_ctrl_dep(). In addition, + * wake_q_add_safe() is used. See ipc/mqueue.c for more details + */ + /* one msg_receiver structure for each sleeping receiver */ struct msg_receiver { struct list_head r_list; @@ -184,6 +194,10 @@ static inline void ss_add(struct msg_queue *msq, { mss->tsk = current; mss->msgsz = msgsz; + /* + * No memory barrier required: we did ipc_lock_object(), + * and the waker obtains that lock before calling wake_q_add(). + */ __set_current_state(TASK_INTERRUPTIBLE); list_add_tail(&mss->list, &msq->q_senders); } @@ -237,8 +251,11 @@ static void expunge_all(struct msg_queue *msq, int res, struct msg_receiver *msr, *t; list_for_each_entry_safe(msr, t, &msq->q_receivers, r_list) { - wake_q_add(wake_q, msr->r_tsk); - WRITE_ONCE(msr->r_msg, ERR_PTR(res)); + get_task_struct(msr->r_tsk); + + /* see MSG_BARRIER for purpose/pairing */ + smp_store_release(&msr->r_msg, ERR_PTR(res)); + wake_q_add_safe(wake_q, msr->r_tsk); } } @@ -377,7 +394,7 @@ copy_msqid_from_user(struct msqid64_ds *out, void __user *buf, int version) * NOTE: no locks must be held, the rwsem is taken inside this function. */ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd, - struct msqid64_ds *msqid64) + struct ipc64_perm *perm, int msg_qbytes) { struct kern_ipc_perm *ipcp; struct msg_queue *msq; @@ -387,7 +404,7 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd, rcu_read_lock(); ipcp = ipcctl_obtain_check(ns, &msg_ids(ns), msqid, cmd, - &msqid64->msg_perm, msqid64->msg_qbytes); + perm, msg_qbytes); if (IS_ERR(ipcp)) { err = PTR_ERR(ipcp); goto out_unlock1; @@ -409,18 +426,18 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd, { DEFINE_WAKE_Q(wake_q); - if (msqid64->msg_qbytes > ns->msg_ctlmnb && + if (msg_qbytes > ns->msg_ctlmnb && !capable(CAP_SYS_RESOURCE)) { err = -EPERM; goto out_unlock1; } ipc_lock_object(&msq->q_perm); - err = ipc_update_perm(&msqid64->msg_perm, ipcp); + err = ipc_update_perm(perm, ipcp); if (err) goto out_unlock0; - msq->q_qbytes = msqid64->msg_qbytes; + msq->q_qbytes = msg_qbytes; msq->q_ctime = ktime_get_real_seconds(); /* @@ -601,9 +618,10 @@ static long ksys_msgctl(int msqid, int cmd, struct msqid_ds __user *buf, int ver case IPC_SET: if (copy_msqid_from_user(&msqid64, buf, version)) return -EFAULT; - /* fallthru */ + return msgctl_down(ns, msqid, cmd, &msqid64.msg_perm, + msqid64.msg_qbytes); case IPC_RMID: - return msgctl_down(ns, msqid, cmd, &msqid64); + return msgctl_down(ns, msqid, cmd, NULL, 0); default: return -EINVAL; } @@ -735,9 +753,9 @@ static long compat_ksys_msgctl(int msqid, int cmd, void __user *uptr, int versio case IPC_SET: if (copy_compat_msqid_from_user(&msqid64, uptr, version)) return -EFAULT; - /* fallthru */ + return msgctl_down(ns, msqid, cmd, &msqid64.msg_perm, msqid64.msg_qbytes); case IPC_RMID: - return msgctl_down(ns, msqid, cmd, &msqid64); + return msgctl_down(ns, msqid, cmd, NULL, 0); default: return -EINVAL; } @@ -798,13 +816,17 @@ static inline int pipelined_send(struct msg_queue *msq, struct msg_msg *msg, list_del(&msr->r_list); if (msr->r_maxsize < msg->m_ts) { wake_q_add(wake_q, msr->r_tsk); - WRITE_ONCE(msr->r_msg, ERR_PTR(-E2BIG)); + + /* See expunge_all regarding memory barrier */ + smp_store_release(&msr->r_msg, ERR_PTR(-E2BIG)); } else { ipc_update_pid(&msq->q_lrpid, task_pid(msr->r_tsk)); msq->q_rtime = ktime_get_real_seconds(); wake_q_add(wake_q, msr->r_tsk); - WRITE_ONCE(msr->r_msg, msg); + + /* See expunge_all regarding memory barrier */ + smp_store_release(&msr->r_msg, msg); return 1; } } @@ -1154,7 +1176,11 @@ static long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, in msr_d.r_maxsize = INT_MAX; else msr_d.r_maxsize = bufsz; - msr_d.r_msg = ERR_PTR(-EAGAIN); + + /* memory barrier not require due to ipc_lock_object() */ + WRITE_ONCE(msr_d.r_msg, ERR_PTR(-EAGAIN)); + + /* memory barrier not required, we own ipc_lock_object() */ __set_current_state(TASK_INTERRUPTIBLE); ipc_unlock_object(&msq->q_perm); @@ -1183,8 +1209,12 @@ static long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, in * signal) it will either see the message and continue ... */ msg = READ_ONCE(msr_d.r_msg); - if (msg != ERR_PTR(-EAGAIN)) + if (msg != ERR_PTR(-EAGAIN)) { + /* see MSG_BARRIER for purpose/pairing */ + smp_acquire__after_ctrl_dep(); + goto out_unlock1; + } /* * ... or see -EAGAIN, acquire the lock to check the message @@ -1192,7 +1222,7 @@ static long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, in */ ipc_lock_object(&msq->q_perm); - msg = msr_d.r_msg; + msg = READ_ONCE(msr_d.r_msg); if (msg != ERR_PTR(-EAGAIN)) goto out_unlock0; diff --git a/ipc/sem.c b/ipc/sem.c index 7da4504bcc7c..4f4303f32077 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -205,15 +205,38 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it); * * Memory ordering: * Most ordering is enforced by using spin_lock() and spin_unlock(). - * The special case is use_global_lock: + * + * Exceptions: + * 1) use_global_lock: (SEM_BARRIER_1) * Setting it from non-zero to 0 is a RELEASE, this is ensured by - * using smp_store_release(). + * using smp_store_release(): Immediately after setting it to 0, + * a simple op can start. * Testing if it is non-zero is an ACQUIRE, this is ensured by using * smp_load_acquire(). * Setting it from 0 to non-zero must be ordered with regards to * this smp_load_acquire(), this is guaranteed because the smp_load_acquire() * is inside a spin_lock() and after a write from 0 to non-zero a * spin_lock()+spin_unlock() is done. + * + * 2) queue.status: (SEM_BARRIER_2) + * Initialization is done while holding sem_lock(), so no further barrier is + * required. + * Setting it to a result code is a RELEASE, this is ensured by both a + * smp_store_release() (for case a) and while holding sem_lock() + * (for case b). + * The AQUIRE when reading the result code without holding sem_lock() is + * achieved by using READ_ONCE() + smp_acquire__after_ctrl_dep(). + * (case a above). + * Reading the result code while holding sem_lock() needs no further barriers, + * the locks inside sem_lock() enforce ordering (case b above) + * + * 3) current->state: + * current->state is set to TASK_INTERRUPTIBLE while holding sem_lock(). + * The wakeup is handled using the wake_q infrastructure. wake_q wakeups may + * happen immediately after calling wake_q_add. As wake_q_add_safe() is called + * when holding sem_lock(), no further barriers are required. + * + * See also ipc/mqueue.c for more details on the covered races. */ #define sc_semmsl sem_ctls[0] @@ -344,12 +367,8 @@ static void complexmode_tryleave(struct sem_array *sma) return; } if (sma->use_global_lock == 1) { - /* - * Immediately after setting use_global_lock to 0, - * a simple op can start. Thus: all memory writes - * performed by the current operation must be visible - * before we set use_global_lock to 0. - */ + + /* See SEM_BARRIER_1 for purpose/pairing */ smp_store_release(&sma->use_global_lock, 0); } else { sma->use_global_lock--; @@ -400,7 +419,7 @@ static inline int sem_lock(struct sem_array *sma, struct sembuf *sops, */ spin_lock(&sem->lock); - /* pairs with smp_store_release() */ + /* see SEM_BARRIER_1 for purpose/pairing */ if (!smp_load_acquire(&sma->use_global_lock)) { /* fast path successful! */ return sops->sem_num; @@ -766,15 +785,12 @@ would_block: static inline void wake_up_sem_queue_prepare(struct sem_queue *q, int error, struct wake_q_head *wake_q) { - wake_q_add(wake_q, q->sleeper); - /* - * Rely on the above implicit barrier, such that we can - * ensure that we hold reference to the task before setting - * q->status. Otherwise we could race with do_exit if the - * task is awoken by an external event before calling - * wake_up_process(). - */ - WRITE_ONCE(q->status, error); + get_task_struct(q->sleeper); + + /* see SEM_BARRIER_2 for purpuse/pairing */ + smp_store_release(&q->status, error); + + wake_q_add_safe(wake_q, q->sleeper); } static void unlink_queue(struct sem_array *sma, struct sem_queue *q) @@ -1852,7 +1868,8 @@ static struct sem_undo *__lookup_undo(struct sem_undo_list *ulp, int semid) { struct sem_undo *un; - list_for_each_entry_rcu(un, &ulp->list_proc, list_proc) { + list_for_each_entry_rcu(un, &ulp->list_proc, list_proc, + spin_is_locked(&ulp->lock)) { if (un->semid == semid) return un; } @@ -2147,9 +2164,11 @@ static long do_semtimedop(int semid, struct sembuf __user *tsops, } do { + /* memory ordering ensured by the lock in sem_lock() */ WRITE_ONCE(queue.status, -EINTR); queue.sleeper = current; + /* memory ordering is ensured by the lock in sem_lock() */ __set_current_state(TASK_INTERRUPTIBLE); sem_unlock(sma, locknum); rcu_read_unlock(); @@ -2172,13 +2191,8 @@ static long do_semtimedop(int semid, struct sembuf __user *tsops, */ error = READ_ONCE(queue.status); if (error != -EINTR) { - /* - * User space could assume that semop() is a memory - * barrier: Without the mb(), the cpu could - * speculatively read in userspace stale data that was - * overwritten by the previous owner of the semaphore. - */ - smp_mb(); + /* see SEM_BARRIER_2 for purpose/pairing */ + smp_acquire__after_ctrl_dep(); goto out_free; } @@ -2188,6 +2202,9 @@ static long do_semtimedop(int semid, struct sembuf __user *tsops, if (!ipc_valid_object(&sma->sem_perm)) goto out_unlock_free; + /* + * No necessity for any barrier: We are protect by sem_lock() + */ error = READ_ONCE(queue.status); /* diff --git a/ipc/syscall.c b/ipc/syscall.c index 581bdff4e7c5..dfb0e988d542 100644 --- a/ipc/syscall.c +++ b/ipc/syscall.c @@ -30,7 +30,7 @@ int ksys_ipc(unsigned int call, int first, unsigned long second, return ksys_semtimedop(first, (struct sembuf __user *)ptr, second, NULL); case SEMTIMEDOP: - if (IS_ENABLED(CONFIG_64BIT) || !IS_ENABLED(CONFIG_64BIT_TIME)) + if (IS_ENABLED(CONFIG_64BIT)) return ksys_semtimedop(first, ptr, second, (const struct __kernel_timespec __user *)fifth); else if (IS_ENABLED(CONFIG_COMPAT_32BIT_TIME)) diff --git a/ipc/util.c b/ipc/util.c index d126d156efc6..fe61df53775a 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -100,7 +100,7 @@ device_initcall(ipc_init); static const struct rhashtable_params ipc_kht_params = { .head_offset = offsetof(struct kern_ipc_perm, khtnode), .key_offset = offsetof(struct kern_ipc_perm, key), - .key_len = FIELD_SIZEOF(struct kern_ipc_perm, key), + .key_len = sizeof_field(struct kern_ipc_perm, key), .automatic_shrinking = true, }; @@ -126,7 +126,7 @@ void ipc_init_ids(struct ipc_ids *ids) } #ifdef CONFIG_PROC_FS -static const struct file_operations sysvipc_proc_fops; +static const struct proc_ops sysvipc_proc_ops; /** * ipc_init_proc_interface - create a proc interface for sysipc types using a seq_file interface. * @path: Path in procfs @@ -151,7 +151,7 @@ void __init ipc_init_proc_interface(const char *path, const char *header, pde = proc_create_data(path, S_IRUGO, /* world readable */ NULL, /* parent dir */ - &sysvipc_proc_fops, + &sysvipc_proc_ops, iface); if (!pde) kfree(iface); @@ -884,10 +884,10 @@ static int sysvipc_proc_release(struct inode *inode, struct file *file) return seq_release_private(inode, file); } -static const struct file_operations sysvipc_proc_fops = { - .open = sysvipc_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = sysvipc_proc_release, +static const struct proc_ops sysvipc_proc_ops = { + .proc_open = sysvipc_proc_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = sysvipc_proc_release, }; #endif /* CONFIG_PROC_FS */ diff --git a/ipc/util.h b/ipc/util.h index 0fcf8e719b76..5766c61aed0e 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -276,29 +276,7 @@ static inline int compat_ipc_parse_version(int *cmd) *cmd &= ~IPC_64; return version; } -#endif -/* for __ARCH_WANT_SYS_IPC */ -long ksys_semtimedop(int semid, struct sembuf __user *tsops, - unsigned int nsops, - const struct __kernel_timespec __user *timeout); -long ksys_semget(key_t key, int nsems, int semflg); -long ksys_old_semctl(int semid, int semnum, int cmd, unsigned long arg); -long ksys_msgget(key_t key, int msgflg); -long ksys_old_msgctl(int msqid, int cmd, struct msqid_ds __user *buf); -long ksys_msgrcv(int msqid, struct msgbuf __user *msgp, size_t msgsz, - long msgtyp, int msgflg); -long ksys_msgsnd(int msqid, struct msgbuf __user *msgp, size_t msgsz, - int msgflg); -long ksys_shmget(key_t key, size_t size, int shmflg); -long ksys_shmdt(char __user *shmaddr); -long ksys_old_shmctl(int shmid, int cmd, struct shmid_ds __user *buf); - -/* for CONFIG_ARCH_WANT_OLD_COMPAT_IPC */ -long compat_ksys_semtimedop(int semid, struct sembuf __user *tsems, - unsigned int nsops, - const struct old_timespec32 __user *timeout); -#ifdef CONFIG_COMPAT long compat_ksys_old_semctl(int semid, int semnum, int cmd, int arg); long compat_ksys_old_msgctl(int msqid, int cmd, void __user *uptr); long compat_ksys_msgrcv(int msqid, compat_uptr_t msgp, compat_ssize_t msgsz, @@ -306,6 +284,7 @@ long compat_ksys_msgrcv(int msqid, compat_uptr_t msgp, compat_ssize_t msgsz, long compat_ksys_msgsnd(int msqid, compat_uptr_t msgp, compat_ssize_t msgsz, int msgflg); long compat_ksys_old_shmctl(int shmid, int cmd, void __user *uptr); -#endif /* CONFIG_COMPAT */ + +#endif #endif |

