From 9552389c465ed1ded39edf4a5642a861b53c2955 Mon Sep 17 00:00:00 2001 From: Gilad Ben-Yossef Date: Tue, 2 Jul 2019 14:39:20 +0300 Subject: crypto: fips - add FIPS test failure notification chain Crypto test failures in FIPS mode cause an immediate panic, but on some system the cryptographic boundary extends beyond just the Linux controlled domain. Add a simple atomic notification chain to allow interested parties to register to receive notification prior to us kicking the bucket. Signed-off-by: Gilad Ben-Yossef Signed-off-by: Herbert Xu --- include/linux/fips.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fips.h b/include/linux/fips.h index afeeece92302..c6961e932fef 100644 --- a/include/linux/fips.h +++ b/include/linux/fips.h @@ -4,8 +4,15 @@ #ifdef CONFIG_CRYPTO_FIPS extern int fips_enabled; +extern struct atomic_notifier_head fips_fail_notif_chain; + +void fips_fail_notify(void); + #else #define fips_enabled 0 + +static inline void fips_fail_notify(void) {} + #endif #endif -- cgit v1.2.3 From 6fc4dbcf0276279d488c5fbbfabe94734134f4fa Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 18 Jul 2019 23:01:46 +0800 Subject: padata: Replace delayed timer with immediate workqueue in padata_reorder The function padata_reorder will use a timer when it cannot progress while completed jobs are outstanding (pd->reorder_objects > 0). This is suboptimal as if we do end up using the timer then it would have introduced a gratuitous delay of one second. In fact we can easily distinguish between whether completed jobs are outstanding and whether we can make progress. All we have to do is look at the next pqueue list. This patch does that by replacing pd->processed with pd->cpu so that the next pqueue is more accessible. A work queue is used instead of the original try_again to avoid hogging the CPU. Note that we don't bother removing the work queue in padata_flush_queues because the whole premise is broken. You cannot flush async crypto requests so it makes no sense to even try. A subsequent patch will fix it by replacing it with a ref counting scheme. Signed-off-by: Herbert Xu --- include/linux/padata.h | 13 +++---- kernel/padata.c | 97 ++++++++++---------------------------------------- 2 files changed, 22 insertions(+), 88 deletions(-) (limited to 'include/linux') diff --git a/include/linux/padata.h b/include/linux/padata.h index 56f09e36f770..8da673861d99 100644 --- a/include/linux/padata.h +++ b/include/linux/padata.h @@ -12,7 +12,6 @@ #include #include #include -#include #include #include @@ -73,18 +72,14 @@ struct padata_serial_queue { * @serial: List to wait for serialization after reordering. * @pwork: work struct for parallelization. * @swork: work struct for serialization. - * @pd: Backpointer to the internal control structure. * @work: work struct for parallelization. - * @reorder_work: work struct for reordering. * @num_obj: Number of objects that are processed by this cpu. * @cpu_index: Index of the cpu. */ struct padata_parallel_queue { struct padata_list parallel; struct padata_list reorder; - struct parallel_data *pd; struct work_struct work; - struct work_struct reorder_work; atomic_t num_obj; int cpu_index; }; @@ -110,10 +105,10 @@ struct padata_cpumask { * @reorder_objects: Number of objects waiting in the reorder queues. * @refcnt: Number of objects holding a reference on this parallel_data. * @max_seq_nr: Maximal used sequence number. + * @cpu: Next CPU to be processed. * @cpumask: The cpumasks in use for parallel and serial workers. + * @reorder_work: work struct for reordering. * @lock: Reorder lock. - * @processed: Number of already processed objects. - * @timer: Reorder timer. */ struct parallel_data { struct padata_instance *pinst; @@ -122,10 +117,10 @@ struct parallel_data { atomic_t reorder_objects; atomic_t refcnt; atomic_t seq_nr; + int cpu; struct padata_cpumask cpumask; + struct work_struct reorder_work; spinlock_t lock ____cacheline_aligned; - unsigned int processed; - struct timer_list timer; }; /** diff --git a/kernel/padata.c b/kernel/padata.c index 15a8ad63f4ff..fbafca18597f 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -165,23 +165,12 @@ EXPORT_SYMBOL(padata_do_parallel); */ static struct padata_priv *padata_get_next(struct parallel_data *pd) { - int cpu, num_cpus; - unsigned int next_nr, next_index; struct padata_parallel_queue *next_queue; struct padata_priv *padata; struct padata_list *reorder; + int cpu = pd->cpu; - num_cpus = cpumask_weight(pd->cpumask.pcpu); - - /* - * Calculate the percpu reorder queue and the sequence - * number of the next object. - */ - next_nr = pd->processed; - next_index = next_nr % num_cpus; - cpu = padata_index_to_cpu(pd, next_index); next_queue = per_cpu_ptr(pd->pqueue, cpu); - reorder = &next_queue->reorder; spin_lock(&reorder->lock); @@ -192,7 +181,8 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd) list_del_init(&padata->list); atomic_dec(&pd->reorder_objects); - pd->processed++; + pd->cpu = cpumask_next_wrap(cpu, pd->cpumask.pcpu, -1, + false); spin_unlock(&reorder->lock); goto out; @@ -215,6 +205,7 @@ static void padata_reorder(struct parallel_data *pd) struct padata_priv *padata; struct padata_serial_queue *squeue; struct padata_instance *pinst = pd->pinst; + struct padata_parallel_queue *next_queue; /* * We need to ensure that only one cpu can work on dequeueing of @@ -246,7 +237,6 @@ static void padata_reorder(struct parallel_data *pd) * so exit immediately. */ if (PTR_ERR(padata) == -ENODATA) { - del_timer(&pd->timer); spin_unlock_bh(&pd->lock); return; } @@ -265,70 +255,29 @@ static void padata_reorder(struct parallel_data *pd) /* * The next object that needs serialization might have arrived to - * the reorder queues in the meantime, we will be called again - * from the timer function if no one else cares for it. + * the reorder queues in the meantime. * - * Ensure reorder_objects is read after pd->lock is dropped so we see - * an increment from another task in padata_do_serial. Pairs with + * Ensure reorder queue is read after pd->lock is dropped so we see + * new objects from another task in padata_do_serial. Pairs with * smp_mb__after_atomic in padata_do_serial. */ smp_mb(); - if (atomic_read(&pd->reorder_objects) - && !(pinst->flags & PADATA_RESET)) - mod_timer(&pd->timer, jiffies + HZ); - else - del_timer(&pd->timer); - return; + next_queue = per_cpu_ptr(pd->pqueue, pd->cpu); + if (!list_empty(&next_queue->reorder.list)) + queue_work(pinst->wq, &pd->reorder_work); } static void invoke_padata_reorder(struct work_struct *work) { - struct padata_parallel_queue *pqueue; struct parallel_data *pd; local_bh_disable(); - pqueue = container_of(work, struct padata_parallel_queue, reorder_work); - pd = pqueue->pd; + pd = container_of(work, struct parallel_data, reorder_work); padata_reorder(pd); local_bh_enable(); } -static void padata_reorder_timer(struct timer_list *t) -{ - struct parallel_data *pd = from_timer(pd, t, timer); - unsigned int weight; - int target_cpu, cpu; - - cpu = get_cpu(); - - /* We don't lock pd here to not interfere with parallel processing - * padata_reorder() calls on other CPUs. We just need any CPU out of - * the cpumask.pcpu set. It would be nice if it's the right one but - * it doesn't matter if we're off to the next one by using an outdated - * pd->processed value. - */ - weight = cpumask_weight(pd->cpumask.pcpu); - target_cpu = padata_index_to_cpu(pd, pd->processed % weight); - - /* ensure to call the reorder callback on the correct CPU */ - if (cpu != target_cpu) { - struct padata_parallel_queue *pqueue; - struct padata_instance *pinst; - - /* The timer function is serialized wrt itself -- no locking - * needed. - */ - pinst = pd->pinst; - pqueue = per_cpu_ptr(pd->pqueue, target_cpu); - queue_work_on(target_cpu, pinst->wq, &pqueue->reorder_work); - } else { - padata_reorder(pd); - } - - put_cpu(); -} - static void padata_serial_worker(struct work_struct *serial_work) { struct padata_serial_queue *squeue; @@ -376,9 +325,8 @@ void padata_do_serial(struct padata_priv *padata) cpu = get_cpu(); - /* We need to run on the same CPU padata_do_parallel(.., padata, ..) - * was called on -- or, at least, enqueue the padata object into the - * correct per-cpu queue. + /* We need to enqueue the padata object into the correct + * per-cpu queue. */ if (cpu != padata->cpu) { reorder_via_wq = 1; @@ -388,12 +336,12 @@ void padata_do_serial(struct padata_priv *padata) pqueue = per_cpu_ptr(pd->pqueue, cpu); spin_lock(&pqueue->reorder.lock); - atomic_inc(&pd->reorder_objects); list_add_tail(&padata->list, &pqueue->reorder.list); + atomic_inc(&pd->reorder_objects); spin_unlock(&pqueue->reorder.lock); /* - * Ensure the atomic_inc of reorder_objects above is ordered correctly + * Ensure the addition to the reorder list is ordered correctly * with the trylock of pd->lock in padata_reorder. Pairs with smp_mb * in padata_reorder. */ @@ -401,13 +349,7 @@ void padata_do_serial(struct padata_priv *padata) put_cpu(); - /* If we're running on the wrong CPU, call padata_reorder() via a - * kernel worker. - */ - if (reorder_via_wq) - queue_work_on(cpu, pd->pinst->wq, &pqueue->reorder_work); - else - padata_reorder(pd); + padata_reorder(pd); } EXPORT_SYMBOL(padata_do_serial); @@ -463,14 +405,12 @@ static void padata_init_pqueues(struct parallel_data *pd) continue; } - pqueue->pd = pd; pqueue->cpu_index = cpu_index; cpu_index++; __padata_list_init(&pqueue->reorder); __padata_list_init(&pqueue->parallel); INIT_WORK(&pqueue->work, padata_parallel_worker); - INIT_WORK(&pqueue->reorder_work, invoke_padata_reorder); atomic_set(&pqueue->num_obj, 0); } } @@ -498,12 +438,13 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst, padata_init_pqueues(pd); padata_init_squeues(pd); - timer_setup(&pd->timer, padata_reorder_timer, 0); atomic_set(&pd->seq_nr, -1); atomic_set(&pd->reorder_objects, 0); atomic_set(&pd->refcnt, 0); pd->pinst = pinst; spin_lock_init(&pd->lock); + pd->cpu = cpumask_first(pcpumask); + INIT_WORK(&pd->reorder_work, invoke_padata_reorder); return pd; @@ -538,8 +479,6 @@ static void padata_flush_queues(struct parallel_data *pd) flush_work(&pqueue->work); } - del_timer_sync(&pd->timer); - if (atomic_read(&pd->reorder_objects)) padata_reorder(pd); -- cgit v1.2.3 From 2ef540476e0077ca2f9992fde9a014891000c4b5 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 29 Jul 2019 00:32:36 +0900 Subject: hwrng: timeriomem - add include guard to timeriomem-rng.h Add a header include guard just in case. Signed-off-by: Masahiro Yamada Signed-off-by: Herbert Xu --- include/linux/timeriomem-rng.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/timeriomem-rng.h b/include/linux/timeriomem-rng.h index fd4a6e6ec831..672df7fbf6c1 100644 --- a/include/linux/timeriomem-rng.h +++ b/include/linux/timeriomem-rng.h @@ -5,6 +5,9 @@ * Copyright (c) 2009 Alexander Clouter */ +#ifndef _LINUX_TIMERIOMEM_RNG_H +#define _LINUX_TIMERIOMEM_RNG_H + struct timeriomem_rng_data { void __iomem *address; @@ -14,3 +17,5 @@ struct timeriomem_rng_data { /* bits of entropy per 1024 bits read */ unsigned int quality; }; + +#endif /* _LINUX_TIMERIOMEM_RNG_H */ -- cgit v1.2.3 From ad767ee858b38af634c957a792cb001d54a7b981 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sat, 17 Aug 2019 16:24:31 +0200 Subject: crypto: sha256 - Move lib/sha256.c to lib/crypto Generic crypto implementations belong under lib/crypto not directly in lib, likewise the header should be in include/crypto, not include/linux. Note that the code in lib/crypto/sha256.c is not yet available for generic use after this commit, it is still only used by the s390 and x86 purgatory code. Making it suitable for generic use is done in further patches in this series. Signed-off-by: Hans de Goede Signed-off-by: Herbert Xu --- arch/s390/purgatory/Makefile | 2 +- arch/s390/purgatory/purgatory.c | 2 +- arch/x86/purgatory/Makefile | 2 +- arch/x86/purgatory/purgatory.c | 2 +- include/crypto/sha256.h | 28 ++++ include/linux/sha256.h | 28 ---- lib/crypto/sha256.c | 279 ++++++++++++++++++++++++++++++++++++++++ lib/sha256.c | 279 ---------------------------------------- 8 files changed, 311 insertions(+), 311 deletions(-) create mode 100644 include/crypto/sha256.h delete mode 100644 include/linux/sha256.h create mode 100644 lib/crypto/sha256.c delete mode 100644 lib/sha256.c (limited to 'include/linux') diff --git a/arch/s390/purgatory/Makefile b/arch/s390/purgatory/Makefile index dc1ae4ff79d7..85b05c9e40f5 100644 --- a/arch/s390/purgatory/Makefile +++ b/arch/s390/purgatory/Makefile @@ -7,7 +7,7 @@ purgatory-y := head.o purgatory.o string.o sha256.o mem.o targets += $(purgatory-y) purgatory.lds purgatory purgatory.ro PURGATORY_OBJS = $(addprefix $(obj)/,$(purgatory-y)) -$(obj)/sha256.o: $(srctree)/lib/sha256.c FORCE +$(obj)/sha256.o: $(srctree)/lib/crypto/sha256.c FORCE $(call if_changed_rule,cc_o_c) $(obj)/mem.o: $(srctree)/arch/s390/lib/mem.S FORCE diff --git a/arch/s390/purgatory/purgatory.c b/arch/s390/purgatory/purgatory.c index 3528e6da4e87..a80c78da9985 100644 --- a/arch/s390/purgatory/purgatory.c +++ b/arch/s390/purgatory/purgatory.c @@ -8,8 +8,8 @@ */ #include -#include #include +#include #include int verify_sha256_digest(void) diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile index 3cf302b26332..026fa0006f0b 100644 --- a/arch/x86/purgatory/Makefile +++ b/arch/x86/purgatory/Makefile @@ -6,7 +6,7 @@ purgatory-y := purgatory.o stack.o setup-x86_$(BITS).o sha256.o entry64.o string targets += $(purgatory-y) PURGATORY_OBJS = $(addprefix $(obj)/,$(purgatory-y)) -$(obj)/sha256.o: $(srctree)/lib/sha256.c FORCE +$(obj)/sha256.o: $(srctree)/lib/crypto/sha256.c FORCE $(call if_changed_rule,cc_o_c) LDFLAGS_purgatory.ro := -e purgatory_start -r --no-undefined -nostdlib -z nodefaultlib diff --git a/arch/x86/purgatory/purgatory.c b/arch/x86/purgatory/purgatory.c index 6d8d5a34c377..7cd7a2618180 100644 --- a/arch/x86/purgatory/purgatory.c +++ b/arch/x86/purgatory/purgatory.c @@ -9,7 +9,7 @@ */ #include -#include +#include #include #include "../boot/string.h" diff --git a/include/crypto/sha256.h b/include/crypto/sha256.h new file mode 100644 index 000000000000..b1f9c6781082 --- /dev/null +++ b/include/crypto/sha256.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2014 Red Hat Inc. + * + * Author: Vivek Goyal + */ + +#ifndef SHA256_H +#define SHA256_H + +#include +#include + +/* + * Stand-alone implementation of the SHA256 algorithm. It is designed to + * have as little dependencies as possible so it can be used in the + * kexec_file purgatory. In other cases you should use the implementation in + * crypto/. + * + * For details see lib/crypto/sha256.c + */ + +extern int sha256_init(struct sha256_state *sctx); +extern int sha256_update(struct sha256_state *sctx, const u8 *input, + unsigned int length); +extern int sha256_final(struct sha256_state *sctx, u8 *hash); + +#endif /* SHA256_H */ diff --git a/include/linux/sha256.h b/include/linux/sha256.h deleted file mode 100644 index 26972b9e92db..000000000000 --- a/include/linux/sha256.h +++ /dev/null @@ -1,28 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2014 Red Hat Inc. - * - * Author: Vivek Goyal - */ - -#ifndef SHA256_H -#define SHA256_H - -#include -#include - -/* - * Stand-alone implementation of the SHA256 algorithm. It is designed to - * have as little dependencies as possible so it can be used in the - * kexec_file purgatory. In other cases you should use the implementation in - * crypto/. - * - * For details see lib/sha256.c - */ - -extern int sha256_init(struct sha256_state *sctx); -extern int sha256_update(struct sha256_state *sctx, const u8 *input, - unsigned int length); -extern int sha256_final(struct sha256_state *sctx, u8 *hash); - -#endif /* SHA256_H */ diff --git a/lib/crypto/sha256.c b/lib/crypto/sha256.c new file mode 100644 index 000000000000..b8114028d06f --- /dev/null +++ b/lib/crypto/sha256.c @@ -0,0 +1,279 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * SHA-256, as specified in + * http://csrc.nist.gov/groups/STM/cavp/documents/shs/sha256-384-512.pdf + * + * SHA-256 code by Jean-Luc Cooke . + * + * Copyright (c) Jean-Luc Cooke + * Copyright (c) Andrew McDonald + * Copyright (c) 2002 James Morris + * Copyright (c) 2014 Red Hat Inc. + */ + +#include +#include +#include +#include + +static inline u32 Ch(u32 x, u32 y, u32 z) +{ + return z ^ (x & (y ^ z)); +} + +static inline u32 Maj(u32 x, u32 y, u32 z) +{ + return (x & y) | (z & (x | y)); +} + +#define e0(x) (ror32(x, 2) ^ ror32(x, 13) ^ ror32(x, 22)) +#define e1(x) (ror32(x, 6) ^ ror32(x, 11) ^ ror32(x, 25)) +#define s0(x) (ror32(x, 7) ^ ror32(x, 18) ^ (x >> 3)) +#define s1(x) (ror32(x, 17) ^ ror32(x, 19) ^ (x >> 10)) + +static inline void LOAD_OP(int I, u32 *W, const u8 *input) +{ + W[I] = __be32_to_cpu(((__be32 *)(input))[I]); +} + +static inline void BLEND_OP(int I, u32 *W) +{ + W[I] = s1(W[I-2]) + W[I-7] + s0(W[I-15]) + W[I-16]; +} + +static void sha256_transform(u32 *state, const u8 *input) +{ + u32 a, b, c, d, e, f, g, h, t1, t2; + u32 W[64]; + int i; + + /* load the input */ + for (i = 0; i < 16; i++) + LOAD_OP(i, W, input); + + /* now blend */ + for (i = 16; i < 64; i++) + BLEND_OP(i, W); + + /* load the state into our registers */ + a = state[0]; b = state[1]; c = state[2]; d = state[3]; + e = state[4]; f = state[5]; g = state[6]; h = state[7]; + + /* now iterate */ + t1 = h + e1(e) + Ch(e, f, g) + 0x428a2f98 + W[0]; + t2 = e0(a) + Maj(a, b, c); d += t1; h = t1 + t2; + t1 = g + e1(d) + Ch(d, e, f) + 0x71374491 + W[1]; + t2 = e0(h) + Maj(h, a, b); c += t1; g = t1 + t2; + t1 = f + e1(c) + Ch(c, d, e) + 0xb5c0fbcf + W[2]; + t2 = e0(g) + Maj(g, h, a); b += t1; f = t1 + t2; + t1 = e + e1(b) + Ch(b, c, d) + 0xe9b5dba5 + W[3]; + t2 = e0(f) + Maj(f, g, h); a += t1; e = t1 + t2; + t1 = d + e1(a) + Ch(a, b, c) + 0x3956c25b + W[4]; + t2 = e0(e) + Maj(e, f, g); h += t1; d = t1 + t2; + t1 = c + e1(h) + Ch(h, a, b) + 0x59f111f1 + W[5]; + t2 = e0(d) + Maj(d, e, f); g += t1; c = t1 + t2; + t1 = b + e1(g) + Ch(g, h, a) + 0x923f82a4 + W[6]; + t2 = e0(c) + Maj(c, d, e); f += t1; b = t1 + t2; + t1 = a + e1(f) + Ch(f, g, h) + 0xab1c5ed5 + W[7]; + t2 = e0(b) + Maj(b, c, d); e += t1; a = t1 + t2; + + t1 = h + e1(e) + Ch(e, f, g) + 0xd807aa98 + W[8]; + t2 = e0(a) + Maj(a, b, c); d += t1; h = t1 + t2; + t1 = g + e1(d) + Ch(d, e, f) + 0x12835b01 + W[9]; + t2 = e0(h) + Maj(h, a, b); c += t1; g = t1 + t2; + t1 = f + e1(c) + Ch(c, d, e) + 0x243185be + W[10]; + t2 = e0(g) + Maj(g, h, a); b += t1; f = t1 + t2; + t1 = e + e1(b) + Ch(b, c, d) + 0x550c7dc3 + W[11]; + t2 = e0(f) + Maj(f, g, h); a += t1; e = t1 + t2; + t1 = d + e1(a) + Ch(a, b, c) + 0x72be5d74 + W[12]; + t2 = e0(e) + Maj(e, f, g); h += t1; d = t1 + t2; + t1 = c + e1(h) + Ch(h, a, b) + 0x80deb1fe + W[13]; + t2 = e0(d) + Maj(d, e, f); g += t1; c = t1 + t2; + t1 = b + e1(g) + Ch(g, h, a) + 0x9bdc06a7 + W[14]; + t2 = e0(c) + Maj(c, d, e); f += t1; b = t1 + t2; + t1 = a + e1(f) + Ch(f, g, h) + 0xc19bf174 + W[15]; + t2 = e0(b) + Maj(b, c, d); e += t1; a = t1 + t2; + + t1 = h + e1(e) + Ch(e, f, g) + 0xe49b69c1 + W[16]; + t2 = e0(a) + Maj(a, b, c); d += t1; h = t1 + t2; + t1 = g + e1(d) + Ch(d, e, f) + 0xefbe4786 + W[17]; + t2 = e0(h) + Maj(h, a, b); c += t1; g = t1 + t2; + t1 = f + e1(c) + Ch(c, d, e) + 0x0fc19dc6 + W[18]; + t2 = e0(g) + Maj(g, h, a); b += t1; f = t1 + t2; + t1 = e + e1(b) + Ch(b, c, d) + 0x240ca1cc + W[19]; + t2 = e0(f) + Maj(f, g, h); a += t1; e = t1 + t2; + t1 = d + e1(a) + Ch(a, b, c) + 0x2de92c6f + W[20]; + t2 = e0(e) + Maj(e, f, g); h += t1; d = t1 + t2; + t1 = c + e1(h) + Ch(h, a, b) + 0x4a7484aa + W[21]; + t2 = e0(d) + Maj(d, e, f); g += t1; c = t1 + t2; + t1 = b + e1(g) + Ch(g, h, a) + 0x5cb0a9dc + W[22]; + t2 = e0(c) + Maj(c, d, e); f += t1; b = t1 + t2; + t1 = a + e1(f) + Ch(f, g, h) + 0x76f988da + W[23]; + t2 = e0(b) + Maj(b, c, d); e += t1; a = t1 + t2; + + t1 = h + e1(e) + Ch(e, f, g) + 0x983e5152 + W[24]; + t2 = e0(a) + Maj(a, b, c); d += t1; h = t1 + t2; + t1 = g + e1(d) + Ch(d, e, f) + 0xa831c66d + W[25]; + t2 = e0(h) + Maj(h, a, b); c += t1; g = t1 + t2; + t1 = f + e1(c) + Ch(c, d, e) + 0xb00327c8 + W[26]; + t2 = e0(g) + Maj(g, h, a); b += t1; f = t1 + t2; + t1 = e + e1(b) + Ch(b, c, d) + 0xbf597fc7 + W[27]; + t2 = e0(f) + Maj(f, g, h); a += t1; e = t1 + t2; + t1 = d + e1(a) + Ch(a, b, c) + 0xc6e00bf3 + W[28]; + t2 = e0(e) + Maj(e, f, g); h += t1; d = t1 + t2; + t1 = c + e1(h) + Ch(h, a, b) + 0xd5a79147 + W[29]; + t2 = e0(d) + Maj(d, e, f); g += t1; c = t1 + t2; + t1 = b + e1(g) + Ch(g, h, a) + 0x06ca6351 + W[30]; + t2 = e0(c) + Maj(c, d, e); f += t1; b = t1 + t2; + t1 = a + e1(f) + Ch(f, g, h) + 0x14292967 + W[31]; + t2 = e0(b) + Maj(b, c, d); e += t1; a = t1 + t2; + + t1 = h + e1(e) + Ch(e, f, g) + 0x27b70a85 + W[32]; + t2 = e0(a) + Maj(a, b, c); d += t1; h = t1 + t2; + t1 = g + e1(d) + Ch(d, e, f) + 0x2e1b2138 + W[33]; + t2 = e0(h) + Maj(h, a, b); c += t1; g = t1 + t2; + t1 = f + e1(c) + Ch(c, d, e) + 0x4d2c6dfc + W[34]; + t2 = e0(g) + Maj(g, h, a); b += t1; f = t1 + t2; + t1 = e + e1(b) + Ch(b, c, d) + 0x53380d13 + W[35]; + t2 = e0(f) + Maj(f, g, h); a += t1; e = t1 + t2; + t1 = d + e1(a) + Ch(a, b, c) + 0x650a7354 + W[36]; + t2 = e0(e) + Maj(e, f, g); h += t1; d = t1 + t2; + t1 = c + e1(h) + Ch(h, a, b) + 0x766a0abb + W[37]; + t2 = e0(d) + Maj(d, e, f); g += t1; c = t1 + t2; + t1 = b + e1(g) + Ch(g, h, a) + 0x81c2c92e + W[38]; + t2 = e0(c) + Maj(c, d, e); f += t1; b = t1 + t2; + t1 = a + e1(f) + Ch(f, g, h) + 0x92722c85 + W[39]; + t2 = e0(b) + Maj(b, c, d); e += t1; a = t1 + t2; + + t1 = h + e1(e) + Ch(e, f, g) + 0xa2bfe8a1 + W[40]; + t2 = e0(a) + Maj(a, b, c); d += t1; h = t1 + t2; + t1 = g + e1(d) + Ch(d, e, f) + 0xa81a664b + W[41]; + t2 = e0(h) + Maj(h, a, b); c += t1; g = t1 + t2; + t1 = f + e1(c) + Ch(c, d, e) + 0xc24b8b70 + W[42]; + t2 = e0(g) + Maj(g, h, a); b += t1; f = t1 + t2; + t1 = e + e1(b) + Ch(b, c, d) + 0xc76c51a3 + W[43]; + t2 = e0(f) + Maj(f, g, h); a += t1; e = t1 + t2; + t1 = d + e1(a) + Ch(a, b, c) + 0xd192e819 + W[44]; + t2 = e0(e) + Maj(e, f, g); h += t1; d = t1 + t2; + t1 = c + e1(h) + Ch(h, a, b) + 0xd6990624 + W[45]; + t2 = e0(d) + Maj(d, e, f); g += t1; c = t1 + t2; + t1 = b + e1(g) + Ch(g, h, a) + 0xf40e3585 + W[46]; + t2 = e0(c) + Maj(c, d, e); f += t1; b = t1 + t2; + t1 = a + e1(f) + Ch(f, g, h) + 0x106aa070 + W[47]; + t2 = e0(b) + Maj(b, c, d); e += t1; a = t1 + t2; + + t1 = h + e1(e) + Ch(e, f, g) + 0x19a4c116 + W[48]; + t2 = e0(a) + Maj(a, b, c); d += t1; h = t1 + t2; + t1 = g + e1(d) + Ch(d, e, f) + 0x1e376c08 + W[49]; + t2 = e0(h) + Maj(h, a, b); c += t1; g = t1 + t2; + t1 = f + e1(c) + Ch(c, d, e) + 0x2748774c + W[50]; + t2 = e0(g) + Maj(g, h, a); b += t1; f = t1 + t2; + t1 = e + e1(b) + Ch(b, c, d) + 0x34b0bcb5 + W[51]; + t2 = e0(f) + Maj(f, g, h); a += t1; e = t1 + t2; + t1 = d + e1(a) + Ch(a, b, c) + 0x391c0cb3 + W[52]; + t2 = e0(e) + Maj(e, f, g); h += t1; d = t1 + t2; + t1 = c + e1(h) + Ch(h, a, b) + 0x4ed8aa4a + W[53]; + t2 = e0(d) + Maj(d, e, f); g += t1; c = t1 + t2; + t1 = b + e1(g) + Ch(g, h, a) + 0x5b9cca4f + W[54]; + t2 = e0(c) + Maj(c, d, e); f += t1; b = t1 + t2; + t1 = a + e1(f) + Ch(f, g, h) + 0x682e6ff3 + W[55]; + t2 = e0(b) + Maj(b, c, d); e += t1; a = t1 + t2; + + t1 = h + e1(e) + Ch(e, f, g) + 0x748f82ee + W[56]; + t2 = e0(a) + Maj(a, b, c); d += t1; h = t1 + t2; + t1 = g + e1(d) + Ch(d, e, f) + 0x78a5636f + W[57]; + t2 = e0(h) + Maj(h, a, b); c += t1; g = t1 + t2; + t1 = f + e1(c) + Ch(c, d, e) + 0x84c87814 + W[58]; + t2 = e0(g) + Maj(g, h, a); b += t1; f = t1 + t2; + t1 = e + e1(b) + Ch(b, c, d) + 0x8cc70208 + W[59]; + t2 = e0(f) + Maj(f, g, h); a += t1; e = t1 + t2; + t1 = d + e1(a) + Ch(a, b, c) + 0x90befffa + W[60]; + t2 = e0(e) + Maj(e, f, g); h += t1; d = t1 + t2; + t1 = c + e1(h) + Ch(h, a, b) + 0xa4506ceb + W[61]; + t2 = e0(d) + Maj(d, e, f); g += t1; c = t1 + t2; + t1 = b + e1(g) + Ch(g, h, a) + 0xbef9a3f7 + W[62]; + t2 = e0(c) + Maj(c, d, e); f += t1; b = t1 + t2; + t1 = a + e1(f) + Ch(f, g, h) + 0xc67178f2 + W[63]; + t2 = e0(b) + Maj(b, c, d); e += t1; a = t1 + t2; + + state[0] += a; state[1] += b; state[2] += c; state[3] += d; + state[4] += e; state[5] += f; state[6] += g; state[7] += h; + + /* clear any sensitive info... */ + a = b = c = d = e = f = g = h = t1 = t2 = 0; + memset(W, 0, 64 * sizeof(u32)); +} + +int sha256_init(struct sha256_state *sctx) +{ + sctx->state[0] = SHA256_H0; + sctx->state[1] = SHA256_H1; + sctx->state[2] = SHA256_H2; + sctx->state[3] = SHA256_H3; + sctx->state[4] = SHA256_H4; + sctx->state[5] = SHA256_H5; + sctx->state[6] = SHA256_H6; + sctx->state[7] = SHA256_H7; + sctx->count = 0; + + return 0; +} + +int sha256_update(struct sha256_state *sctx, const u8 *data, unsigned int len) +{ + unsigned int partial, done; + const u8 *src; + + partial = sctx->count & 0x3f; + sctx->count += len; + done = 0; + src = data; + + if ((partial + len) > 63) { + if (partial) { + done = -partial; + memcpy(sctx->buf + partial, data, done + 64); + src = sctx->buf; + } + + do { + sha256_transform(sctx->state, src); + done += 64; + src = data + done; + } while (done + 63 < len); + + partial = 0; + } + memcpy(sctx->buf + partial, src, len - done); + + return 0; +} + +int sha256_final(struct sha256_state *sctx, u8 *out) +{ + __be32 *dst = (__be32 *)out; + __be64 bits; + unsigned int index, pad_len; + int i; + static const u8 padding[64] = { 0x80, }; + + /* Save number of bits */ + bits = cpu_to_be64(sctx->count << 3); + + /* Pad out to 56 mod 64. */ + index = sctx->count & 0x3f; + pad_len = (index < 56) ? (56 - index) : ((64+56) - index); + sha256_update(sctx, padding, pad_len); + + /* Append length (before padding) */ + sha256_update(sctx, (const u8 *)&bits, sizeof(bits)); + + /* Store state in digest */ + for (i = 0; i < 8; i++) + dst[i] = cpu_to_be32(sctx->state[i]); + + /* Zeroize sensitive information. */ + memset(sctx, 0, sizeof(*sctx)); + + return 0; +} diff --git a/lib/sha256.c b/lib/sha256.c deleted file mode 100644 index ba4dce0b3711..000000000000 --- a/lib/sha256.c +++ /dev/null @@ -1,279 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * SHA-256, as specified in - * http://csrc.nist.gov/groups/STM/cavp/documents/shs/sha256-384-512.pdf - * - * SHA-256 code by Jean-Luc Cooke . - * - * Copyright (c) Jean-Luc Cooke - * Copyright (c) Andrew McDonald - * Copyright (c) 2002 James Morris - * Copyright (c) 2014 Red Hat Inc. - */ - -#include -#include -#include -#include - -static inline u32 Ch(u32 x, u32 y, u32 z) -{ - return z ^ (x & (y ^ z)); -} - -static inline u32 Maj(u32 x, u32 y, u32 z) -{ - return (x & y) | (z & (x | y)); -} - -#define e0(x) (ror32(x, 2) ^ ror32(x, 13) ^ ror32(x, 22)) -#define e1(x) (ror32(x, 6) ^ ror32(x, 11) ^ ror32(x, 25)) -#define s0(x) (ror32(x, 7) ^ ror32(x, 18) ^ (x >> 3)) -#define s1(x) (ror32(x, 17) ^ ror32(x, 19) ^ (x >> 10)) - -static inline void LOAD_OP(int I, u32 *W, const u8 *input) -{ - W[I] = __be32_to_cpu(((__be32 *)(input))[I]); -} - -static inline void BLEND_OP(int I, u32 *W) -{ - W[I] = s1(W[I-2]) + W[I-7] + s0(W[I-15]) + W[I-16]; -} - -static void sha256_transform(u32 *state, const u8 *input) -{ - u32 a, b, c, d, e, f, g, h, t1, t2; - u32 W[64]; - int i; - - /* load the input */ - for (i = 0; i < 16; i++) - LOAD_OP(i, W, input); - - /* now blend */ - for (i = 16; i < 64; i++) - BLEND_OP(i, W); - - /* load the state into our registers */ - a = state[0]; b = state[1]; c = state[2]; d = state[3]; - e = state[4]; f = state[5]; g = state[6]; h = state[7]; - - /* now iterate */ - t1 = h + e1(e) + Ch(e, f, g) + 0x428a2f98 + W[0]; - t2 = e0(a) + Maj(a, b, c); d += t1; h = t1 + t2; - t1 = g + e1(d) + Ch(d, e, f) + 0x71374491 + W[1]; - t2 = e0(h) + Maj(h, a, b); c += t1; g = t1 + t2; - t1 = f + e1(c) + Ch(c, d, e) + 0xb5c0fbcf + W[2]; - t2 = e0(g) + Maj(g, h, a); b += t1; f = t1 + t2; - t1 = e + e1(b) + Ch(b, c, d) + 0xe9b5dba5 + W[3]; - t2 = e0(f) + Maj(f, g, h); a += t1; e = t1 + t2; - t1 = d + e1(a) + Ch(a, b, c) + 0x3956c25b + W[4]; - t2 = e0(e) + Maj(e, f, g); h += t1; d = t1 + t2; - t1 = c + e1(h) + Ch(h, a, b) + 0x59f111f1 + W[5]; - t2 = e0(d) + Maj(d, e, f); g += t1; c = t1 + t2; - t1 = b + e1(g) + Ch(g, h, a) + 0x923f82a4 + W[6]; - t2 = e0(c) + Maj(c, d, e); f += t1; b = t1 + t2; - t1 = a + e1(f) + Ch(f, g, h) + 0xab1c5ed5 + W[7]; - t2 = e0(b) + Maj(b, c, d); e += t1; a = t1 + t2; - - t1 = h + e1(e) + Ch(e, f, g) + 0xd807aa98 + W[8]; - t2 = e0(a) + Maj(a, b, c); d += t1; h = t1 + t2; - t1 = g + e1(d) + Ch(d, e, f) + 0x12835b01 + W[9]; - t2 = e0(h) + Maj(h, a, b); c += t1; g = t1 + t2; - t1 = f + e1(c) + Ch(c, d, e) + 0x243185be + W[10]; - t2 = e0(g) + Maj(g, h, a); b += t1; f = t1 + t2; - t1 = e + e1(b) + Ch(b, c, d) + 0x550c7dc3 + W[11]; - t2 = e0(f) + Maj(f, g, h); a += t1; e = t1 + t2; - t1 = d + e1(a) + Ch(a, b, c) + 0x72be5d74 + W[12]; - t2 = e0(e) + Maj(e, f, g); h += t1; d = t1 + t2; - t1 = c + e1(h) + Ch(h, a, b) + 0x80deb1fe + W[13]; - t2 = e0(d) + Maj(d, e, f); g += t1; c = t1 + t2; - t1 = b + e1(g) + Ch(g, h, a) + 0x9bdc06a7 + W[14]; - t2 = e0(c) + Maj(c, d, e); f += t1; b = t1 + t2; - t1 = a + e1(f) + Ch(f, g, h) + 0xc19bf174 + W[15]; - t2 = e0(b) + Maj(b, c, d); e += t1; a = t1 + t2; - - t1 = h + e1(e) + Ch(e, f, g) + 0xe49b69c1 + W[16]; - t2 = e0(a) + Maj(a, b, c); d += t1; h = t1 + t2; - t1 = g + e1(d) + Ch(d, e, f) + 0xefbe4786 + W[17]; - t2 = e0(h) + Maj(h, a, b); c += t1; g = t1 + t2; - t1 = f + e1(c) + Ch(c, d, e) + 0x0fc19dc6 + W[18]; - t2 = e0(g) + Maj(g, h, a); b += t1; f = t1 + t2; - t1 = e + e1(b) + Ch(b, c, d) + 0x240ca1cc + W[19]; - t2 = e0(f) + Maj(f, g, h); a += t1; e = t1 + t2; - t1 = d + e1(a) + Ch(a, b, c) + 0x2de92c6f + W[20]; - t2 = e0(e) + Maj(e, f, g); h += t1; d = t1 + t2; - t1 = c + e1(h) + Ch(h, a, b) + 0x4a7484aa + W[21]; - t2 = e0(d) + Maj(d, e, f); g += t1; c = t1 + t2; - t1 = b + e1(g) + Ch(g, h, a) + 0x5cb0a9dc + W[22]; - t2 = e0(c) + Maj(c, d, e); f += t1; b = t1 + t2; - t1 = a + e1(f) + Ch(f, g, h) + 0x76f988da + W[23]; - t2 = e0(b) + Maj(b, c, d); e += t1; a = t1 + t2; - - t1 = h + e1(e) + Ch(e, f, g) + 0x983e5152 + W[24]; - t2 = e0(a) + Maj(a, b, c); d += t1; h = t1 + t2; - t1 = g + e1(d) + Ch(d, e, f) + 0xa831c66d + W[25]; - t2 = e0(h) + Maj(h, a, b); c += t1; g = t1 + t2; - t1 = f + e1(c) + Ch(c, d, e) + 0xb00327c8 + W[26]; - t2 = e0(g) + Maj(g, h, a); b += t1; f = t1 + t2; - t1 = e + e1(b) + Ch(b, c, d) + 0xbf597fc7 + W[27]; - t2 = e0(f) + Maj(f, g, h); a += t1; e = t1 + t2; - t1 = d + e1(a) + Ch(a, b, c) + 0xc6e00bf3 + W[28]; - t2 = e0(e) + Maj(e, f, g); h += t1; d = t1 + t2; - t1 = c + e1(h) + Ch(h, a, b) + 0xd5a79147 + W[29]; - t2 = e0(d) + Maj(d, e, f); g += t1; c = t1 + t2; - t1 = b + e1(g) + Ch(g, h, a) + 0x06ca6351 + W[30]; - t2 = e0(c) + Maj(c, d, e); f += t1; b = t1 + t2; - t1 = a + e1(f) + Ch(f, g, h) + 0x14292967 + W[31]; - t2 = e0(b) + Maj(b, c, d); e += t1; a = t1 + t2; - - t1 = h + e1(e) + Ch(e, f, g) + 0x27b70a85 + W[32]; - t2 = e0(a) + Maj(a, b, c); d += t1; h = t1 + t2; - t1 = g + e1(d) + Ch(d, e, f) + 0x2e1b2138 + W[33]; - t2 = e0(h) + Maj(h, a, b); c += t1; g = t1 + t2; - t1 = f + e1(c) + Ch(c, d, e) + 0x4d2c6dfc + W[34]; - t2 = e0(g) + Maj(g, h, a); b += t1; f = t1 + t2; - t1 = e + e1(b) + Ch(b, c, d) + 0x53380d13 + W[35]; - t2 = e0(f) + Maj(f, g, h); a += t1; e = t1 + t2; - t1 = d + e1(a) + Ch(a, b, c) + 0x650a7354 + W[36]; - t2 = e0(e) + Maj(e, f, g); h += t1; d = t1 + t2; - t1 = c + e1(h) + Ch(h, a, b) + 0x766a0abb + W[37]; - t2 = e0(d) + Maj(d, e, f); g += t1; c = t1 + t2; - t1 = b + e1(g) + Ch(g, h, a) + 0x81c2c92e + W[38]; - t2 = e0(c) + Maj(c, d, e); f += t1; b = t1 + t2; - t1 = a + e1(f) + Ch(f, g, h) + 0x92722c85 + W[39]; - t2 = e0(b) + Maj(b, c, d); e += t1; a = t1 + t2; - - t1 = h + e1(e) + Ch(e, f, g) + 0xa2bfe8a1 + W[40]; - t2 = e0(a) + Maj(a, b, c); d += t1; h = t1 + t2; - t1 = g + e1(d) + Ch(d, e, f) + 0xa81a664b + W[41]; - t2 = e0(h) + Maj(h, a, b); c += t1; g = t1 + t2; - t1 = f + e1(c) + Ch(c, d, e) + 0xc24b8b70 + W[42]; - t2 = e0(g) + Maj(g, h, a); b += t1; f = t1 + t2; - t1 = e + e1(b) + Ch(b, c, d) + 0xc76c51a3 + W[43]; - t2 = e0(f) + Maj(f, g, h); a += t1; e = t1 + t2; - t1 = d + e1(a) + Ch(a, b, c) + 0xd192e819 + W[44]; - t2 = e0(e) + Maj(e, f, g); h += t1; d = t1 + t2; - t1 = c + e1(h) + Ch(h, a, b) + 0xd6990624 + W[45]; - t2 = e0(d) + Maj(d, e, f); g += t1; c = t1 + t2; - t1 = b + e1(g) + Ch(g, h, a) + 0xf40e3585 + W[46]; - t2 = e0(c) + Maj(c, d, e); f += t1; b = t1 + t2; - t1 = a + e1(f) + Ch(f, g, h) + 0x106aa070 + W[47]; - t2 = e0(b) + Maj(b, c, d); e += t1; a = t1 + t2; - - t1 = h + e1(e) + Ch(e, f, g) + 0x19a4c116 + W[48]; - t2 = e0(a) + Maj(a, b, c); d += t1; h = t1 + t2; - t1 = g + e1(d) + Ch(d, e, f) + 0x1e376c08 + W[49]; - t2 = e0(h) + Maj(h, a, b); c += t1; g = t1 + t2; - t1 = f + e1(c) + Ch(c, d, e) + 0x2748774c + W[50]; - t2 = e0(g) + Maj(g, h, a); b += t1; f = t1 + t2; - t1 = e + e1(b) + Ch(b, c, d) + 0x34b0bcb5 + W[51]; - t2 = e0(f) + Maj(f, g, h); a += t1; e = t1 + t2; - t1 = d + e1(a) + Ch(a, b, c) + 0x391c0cb3 + W[52]; - t2 = e0(e) + Maj(e, f, g); h += t1; d = t1 + t2; - t1 = c + e1(h) + Ch(h, a, b) + 0x4ed8aa4a + W[53]; - t2 = e0(d) + Maj(d, e, f); g += t1; c = t1 + t2; - t1 = b + e1(g) + Ch(g, h, a) + 0x5b9cca4f + W[54]; - t2 = e0(c) + Maj(c, d, e); f += t1; b = t1 + t2; - t1 = a + e1(f) + Ch(f, g, h) + 0x682e6ff3 + W[55]; - t2 = e0(b) + Maj(b, c, d); e += t1; a = t1 + t2; - - t1 = h + e1(e) + Ch(e, f, g) + 0x748f82ee + W[56]; - t2 = e0(a) + Maj(a, b, c); d += t1; h = t1 + t2; - t1 = g + e1(d) + Ch(d, e, f) + 0x78a5636f + W[57]; - t2 = e0(h) + Maj(h, a, b); c += t1; g = t1 + t2; - t1 = f + e1(c) + Ch(c, d, e) + 0x84c87814 + W[58]; - t2 = e0(g) + Maj(g, h, a); b += t1; f = t1 + t2; - t1 = e + e1(b) + Ch(b, c, d) + 0x8cc70208 + W[59]; - t2 = e0(f) + Maj(f, g, h); a += t1; e = t1 + t2; - t1 = d + e1(a) + Ch(a, b, c) + 0x90befffa + W[60]; - t2 = e0(e) + Maj(e, f, g); h += t1; d = t1 + t2; - t1 = c + e1(h) + Ch(h, a, b) + 0xa4506ceb + W[61]; - t2 = e0(d) + Maj(d, e, f); g += t1; c = t1 + t2; - t1 = b + e1(g) + Ch(g, h, a) + 0xbef9a3f7 + W[62]; - t2 = e0(c) + Maj(c, d, e); f += t1; b = t1 + t2; - t1 = a + e1(f) + Ch(f, g, h) + 0xc67178f2 + W[63]; - t2 = e0(b) + Maj(b, c, d); e += t1; a = t1 + t2; - - state[0] += a; state[1] += b; state[2] += c; state[3] += d; - state[4] += e; state[5] += f; state[6] += g; state[7] += h; - - /* clear any sensitive info... */ - a = b = c = d = e = f = g = h = t1 = t2 = 0; - memset(W, 0, 64 * sizeof(u32)); -} - -int sha256_init(struct sha256_state *sctx) -{ - sctx->state[0] = SHA256_H0; - sctx->state[1] = SHA256_H1; - sctx->state[2] = SHA256_H2; - sctx->state[3] = SHA256_H3; - sctx->state[4] = SHA256_H4; - sctx->state[5] = SHA256_H5; - sctx->state[6] = SHA256_H6; - sctx->state[7] = SHA256_H7; - sctx->count = 0; - - return 0; -} - -int sha256_update(struct sha256_state *sctx, const u8 *data, unsigned int len) -{ - unsigned int partial, done; - const u8 *src; - - partial = sctx->count & 0x3f; - sctx->count += len; - done = 0; - src = data; - - if ((partial + len) > 63) { - if (partial) { - done = -partial; - memcpy(sctx->buf + partial, data, done + 64); - src = sctx->buf; - } - - do { - sha256_transform(sctx->state, src); - done += 64; - src = data + done; - } while (done + 63 < len); - - partial = 0; - } - memcpy(sctx->buf + partial, src, len - done); - - return 0; -} - -int sha256_final(struct sha256_state *sctx, u8 *out) -{ - __be32 *dst = (__be32 *)out; - __be64 bits; - unsigned int index, pad_len; - int i; - static const u8 padding[64] = { 0x80, }; - - /* Save number of bits */ - bits = cpu_to_be64(sctx->count << 3); - - /* Pad out to 56 mod 64. */ - index = sctx->count & 0x3f; - pad_len = (index < 56) ? (56 - index) : ((64+56) - index); - sha256_update(sctx, padding, pad_len); - - /* Append length (before padding) */ - sha256_update(sctx, (const u8 *)&bits, sizeof(bits)); - - /* Store state in digest */ - for (i = 0; i < 8; i++) - dst[i] = cpu_to_be32(sctx->state[i]); - - /* Zeroize sensitive information. */ - memset(sctx, 0, sizeof(*sctx)); - - return 0; -} -- cgit v1.2.3 From b128a30409356df65f1a51cff3eb986cac8cfedc Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Thu, 5 Sep 2019 21:40:21 -0400 Subject: padata: allocate workqueue internally Move workqueue allocation inside of padata to prepare for further changes to how padata uses workqueues. Guarantees the workqueue is created with max_active=1, which padata relies on to work correctly. No functional change. Signed-off-by: Daniel Jordan Acked-by: Steffen Klassert Cc: Herbert Xu Cc: Jonathan Corbet Cc: Lai Jiangshan Cc: Peter Zijlstra Cc: Tejun Heo Cc: linux-crypto@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Herbert Xu --- Documentation/padata.txt | 12 ++++++------ crypto/pcrypt.c | 13 ++----------- include/linux/padata.h | 3 +-- kernel/padata.c | 24 +++++++++++++++--------- 4 files changed, 24 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/Documentation/padata.txt b/Documentation/padata.txt index b103d0c82000..b37ba1eaace3 100644 --- a/Documentation/padata.txt +++ b/Documentation/padata.txt @@ -16,10 +16,12 @@ overall control of how tasks are to be run:: #include - struct padata_instance *padata_alloc(struct workqueue_struct *wq, + struct padata_instance *padata_alloc(const char *name, const struct cpumask *pcpumask, const struct cpumask *cbcpumask); +'name' simply identifies the instance. + The pcpumask describes which processors will be used to execute work submitted to this instance in parallel. The cbcpumask defines which processors are allowed to be used as the serialization callback processor. @@ -128,8 +130,7 @@ in that CPU mask or about a not running instance. Each task submitted to padata_do_parallel() will, in turn, be passed to exactly one call to the above-mentioned parallel() function, on one CPU, so -true parallelism is achieved by submitting multiple tasks. Despite the -fact that the workqueue is used to make these calls, parallel() is run with +true parallelism is achieved by submitting multiple tasks. parallel() runs with software interrupts disabled and thus cannot sleep. The parallel() function gets the padata_priv structure pointer as its lone parameter; information about the actual work to be done is probably obtained by using @@ -148,7 +149,7 @@ fact with a call to:: At some point in the future, padata_do_serial() will trigger a call to the serial() function in the padata_priv structure. That call will happen on the CPU requested in the initial call to padata_do_parallel(); it, too, is -done through the workqueue, but with local software interrupts disabled. +run with local software interrupts disabled. Note that this call may be deferred for a while since the padata code takes pains to ensure that tasks are completed in the order in which they were submitted. @@ -159,5 +160,4 @@ when a padata instance is no longer needed:: void padata_free(struct padata_instance *pinst); This function will busy-wait while any remaining tasks are completed, so it -might be best not to call it while there is work outstanding. Shutting -down the workqueue, if necessary, should be done separately. +might be best not to call it while there is work outstanding. diff --git a/crypto/pcrypt.c b/crypto/pcrypt.c index 0edf5b54fc77..d67293063c7f 100644 --- a/crypto/pcrypt.c +++ b/crypto/pcrypt.c @@ -20,7 +20,6 @@ struct padata_pcrypt { struct padata_instance *pinst; - struct workqueue_struct *wq; /* * Cpumask for callback CPUs. It should be @@ -397,14 +396,9 @@ static int pcrypt_init_padata(struct padata_pcrypt *pcrypt, get_online_cpus(); - pcrypt->wq = alloc_workqueue("%s", WQ_MEM_RECLAIM | WQ_CPU_INTENSIVE, - 1, name); - if (!pcrypt->wq) - goto err; - - pcrypt->pinst = padata_alloc_possible(pcrypt->wq); + pcrypt->pinst = padata_alloc_possible(name); if (!pcrypt->pinst) - goto err_destroy_workqueue; + goto err; mask = kmalloc(sizeof(*mask), GFP_KERNEL); if (!mask) @@ -437,8 +431,6 @@ err_free_cpumask: kfree(mask); err_free_padata: padata_free(pcrypt->pinst); -err_destroy_workqueue: - destroy_workqueue(pcrypt->wq); err: put_online_cpus(); @@ -452,7 +444,6 @@ static void pcrypt_fini_padata(struct padata_pcrypt *pcrypt) padata_stop(pcrypt->pinst); padata_unregister_cpumask_notifier(pcrypt->pinst, &pcrypt->nblock); - destroy_workqueue(pcrypt->wq); padata_free(pcrypt->pinst); } diff --git a/include/linux/padata.h b/include/linux/padata.h index 8da673861d99..839d9319920a 100644 --- a/include/linux/padata.h +++ b/include/linux/padata.h @@ -151,8 +151,7 @@ struct padata_instance { #define PADATA_INVALID 4 }; -extern struct padata_instance *padata_alloc_possible( - struct workqueue_struct *wq); +extern struct padata_instance *padata_alloc_possible(const char *name); extern void padata_free(struct padata_instance *pinst); extern int padata_do_parallel(struct padata_instance *pinst, struct padata_priv *padata, int cb_cpu); diff --git a/kernel/padata.c b/kernel/padata.c index b60cc3dcee58..58728cd7f40c 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -805,6 +805,7 @@ static void __padata_free(struct padata_instance *pinst) padata_free_pd(pinst->pd); free_cpumask_var(pinst->cpumask.pcpu); free_cpumask_var(pinst->cpumask.cbcpu); + destroy_workqueue(pinst->wq); kfree(pinst); } @@ -938,13 +939,13 @@ static struct kobj_type padata_attr_type = { * padata_alloc - allocate and initialize a padata instance and specify * cpumasks for serial and parallel workers. * - * @wq: workqueue to use for the allocated padata instance + * @name: used to identify the instance * @pcpumask: cpumask that will be used for padata parallelization * @cbcpumask: cpumask that will be used for padata serialization * * Must be called from a cpus_read_lock() protected region */ -static struct padata_instance *padata_alloc(struct workqueue_struct *wq, +static struct padata_instance *padata_alloc(const char *name, const struct cpumask *pcpumask, const struct cpumask *cbcpumask) { @@ -955,11 +956,16 @@ static struct padata_instance *padata_alloc(struct workqueue_struct *wq, if (!pinst) goto err; - if (!alloc_cpumask_var(&pinst->cpumask.pcpu, GFP_KERNEL)) + pinst->wq = alloc_workqueue("%s", WQ_MEM_RECLAIM | WQ_CPU_INTENSIVE, + 1, name); + if (!pinst->wq) goto err_free_inst; + + if (!alloc_cpumask_var(&pinst->cpumask.pcpu, GFP_KERNEL)) + goto err_free_wq; if (!alloc_cpumask_var(&pinst->cpumask.cbcpu, GFP_KERNEL)) { free_cpumask_var(pinst->cpumask.pcpu); - goto err_free_inst; + goto err_free_wq; } if (!padata_validate_cpumask(pinst, pcpumask) || !padata_validate_cpumask(pinst, cbcpumask)) @@ -971,8 +977,6 @@ static struct padata_instance *padata_alloc(struct workqueue_struct *wq, rcu_assign_pointer(pinst->pd, pd); - pinst->wq = wq; - cpumask_copy(pinst->cpumask.pcpu, pcpumask); cpumask_copy(pinst->cpumask.cbcpu, cbcpumask); @@ -990,6 +994,8 @@ static struct padata_instance *padata_alloc(struct workqueue_struct *wq, err_free_masks: free_cpumask_var(pinst->cpumask.pcpu); free_cpumask_var(pinst->cpumask.cbcpu); +err_free_wq: + destroy_workqueue(pinst->wq); err_free_inst: kfree(pinst); err: @@ -1001,14 +1007,14 @@ err: * Use the cpu_possible_mask for serial and * parallel workers. * - * @wq: workqueue to use for the allocated padata instance + * @name: used to identify the instance * * Must be called from a cpus_read_lock() protected region */ -struct padata_instance *padata_alloc_possible(struct workqueue_struct *wq) +struct padata_instance *padata_alloc_possible(const char *name) { lockdep_assert_cpus_held(); - return padata_alloc(wq, cpu_possible_mask, cpu_possible_mask); + return padata_alloc(name, cpu_possible_mask, cpu_possible_mask); } EXPORT_SYMBOL(padata_alloc_possible); -- cgit v1.2.3 From 513c98d08682957cc9eba20e7e4bb349970711f3 Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Thu, 5 Sep 2019 21:40:22 -0400 Subject: workqueue: unconfine alloc/apply/free_workqueue_attrs() padata will use these these interfaces in a later patch, so unconfine them. Signed-off-by: Daniel Jordan Acked-by: Tejun Heo Acked-by: Steffen Klassert Cc: Herbert Xu Cc: Lai Jiangshan Cc: Peter Zijlstra Cc: linux-crypto@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Herbert Xu --- include/linux/workqueue.h | 4 ++++ kernel/workqueue.c | 6 +++--- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index b7c585b5ec1c..4261d1c6e87b 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -435,6 +435,10 @@ struct workqueue_struct *alloc_workqueue(const char *fmt, extern void destroy_workqueue(struct workqueue_struct *wq); +struct workqueue_attrs *alloc_workqueue_attrs(void); +void free_workqueue_attrs(struct workqueue_attrs *attrs); +int apply_workqueue_attrs(struct workqueue_struct *wq, + const struct workqueue_attrs *attrs); int workqueue_set_unbound_cpumask(cpumask_var_t cpumask); extern bool queue_work_on(int cpu, struct workqueue_struct *wq, diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 601d61150b65..f53705ff3ff1 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3329,7 +3329,7 @@ EXPORT_SYMBOL_GPL(execute_in_process_context); * * Undo alloc_workqueue_attrs(). */ -static void free_workqueue_attrs(struct workqueue_attrs *attrs) +void free_workqueue_attrs(struct workqueue_attrs *attrs) { if (attrs) { free_cpumask_var(attrs->cpumask); @@ -3345,7 +3345,7 @@ static void free_workqueue_attrs(struct workqueue_attrs *attrs) * * Return: The allocated new workqueue_attr on success. %NULL on failure. */ -static struct workqueue_attrs *alloc_workqueue_attrs(void) +struct workqueue_attrs *alloc_workqueue_attrs(void) { struct workqueue_attrs *attrs; @@ -4032,7 +4032,7 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq, * * Return: 0 on success and -errno on failure. */ -static int apply_workqueue_attrs(struct workqueue_struct *wq, +int apply_workqueue_attrs(struct workqueue_struct *wq, const struct workqueue_attrs *attrs) { int ret; -- cgit v1.2.3 From e6ce0e0807e90d38a2cefa524ac253d7a85c3f2f Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Thu, 5 Sep 2019 21:40:24 -0400 Subject: padata: make padata_do_parallel find alternate callback CPU padata_do_parallel currently returns -EINVAL if the callback CPU isn't in the callback cpumask. pcrypt tries to prevent this situation by keeping its own callback cpumask in sync with padata's and checks that the callback CPU it passes to padata is valid. Make padata handle this instead. padata_do_parallel now takes a pointer to the callback CPU and updates it for the caller if an alternate CPU is used. Overall behavior in terms of which callback CPUs are chosen stays the same. Prepares for removal of the padata cpumask notifier in pcrypt, which will fix a lockdep complaint about nested acquisition of the CPU hotplug lock later in the series. Signed-off-by: Daniel Jordan Acked-by: Steffen Klassert Cc: Herbert Xu Cc: Lai Jiangshan Cc: Peter Zijlstra Cc: Tejun Heo Cc: linux-crypto@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Herbert Xu --- crypto/pcrypt.c | 33 ++------------------------------- include/linux/padata.h | 2 +- kernel/padata.c | 27 ++++++++++++++++++++------- 3 files changed, 23 insertions(+), 39 deletions(-) (limited to 'include/linux') diff --git a/crypto/pcrypt.c b/crypto/pcrypt.c index d67293063c7f..efca962ab12a 100644 --- a/crypto/pcrypt.c +++ b/crypto/pcrypt.c @@ -57,35 +57,6 @@ struct pcrypt_aead_ctx { unsigned int cb_cpu; }; -static int pcrypt_do_parallel(struct padata_priv *padata, unsigned int *cb_cpu, - struct padata_pcrypt *pcrypt) -{ - unsigned int cpu_index, cpu, i; - struct pcrypt_cpumask *cpumask; - - cpu = *cb_cpu; - - rcu_read_lock_bh(); - cpumask = rcu_dereference_bh(pcrypt->cb_cpumask); - if (cpumask_test_cpu(cpu, cpumask->mask)) - goto out; - - if (!cpumask_weight(cpumask->mask)) - goto out; - - cpu_index = cpu % cpumask_weight(cpumask->mask); - - cpu = cpumask_first(cpumask->mask); - for (i = 0; i < cpu_index; i++) - cpu = cpumask_next(cpu, cpumask->mask); - - *cb_cpu = cpu; - -out: - rcu_read_unlock_bh(); - return padata_do_parallel(pcrypt->pinst, padata, cpu); -} - static int pcrypt_aead_setkey(struct crypto_aead *parent, const u8 *key, unsigned int keylen) { @@ -157,7 +128,7 @@ static int pcrypt_aead_encrypt(struct aead_request *req) req->cryptlen, req->iv); aead_request_set_ad(creq, req->assoclen); - err = pcrypt_do_parallel(padata, &ctx->cb_cpu, &pencrypt); + err = padata_do_parallel(pencrypt.pinst, padata, &ctx->cb_cpu); if (!err) return -EINPROGRESS; @@ -199,7 +170,7 @@ static int pcrypt_aead_decrypt(struct aead_request *req) req->cryptlen, req->iv); aead_request_set_ad(creq, req->assoclen); - err = pcrypt_do_parallel(padata, &ctx->cb_cpu, &pdecrypt); + err = padata_do_parallel(pdecrypt.pinst, padata, &ctx->cb_cpu); if (!err) return -EINPROGRESS; diff --git a/include/linux/padata.h b/include/linux/padata.h index 839d9319920a..f7851f8e2190 100644 --- a/include/linux/padata.h +++ b/include/linux/padata.h @@ -154,7 +154,7 @@ struct padata_instance { extern struct padata_instance *padata_alloc_possible(const char *name); extern void padata_free(struct padata_instance *pinst); extern int padata_do_parallel(struct padata_instance *pinst, - struct padata_priv *padata, int cb_cpu); + struct padata_priv *padata, int *cb_cpu); extern void padata_do_serial(struct padata_priv *padata); extern int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type, cpumask_var_t cpumask); diff --git a/kernel/padata.c b/kernel/padata.c index 58728cd7f40c..9a17922ec436 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -94,17 +94,19 @@ static void padata_parallel_worker(struct work_struct *parallel_work) * * @pinst: padata instance * @padata: object to be parallelized - * @cb_cpu: cpu the serialization callback function will run on, - * must be in the serial cpumask of padata(i.e. cpumask.cbcpu). + * @cb_cpu: pointer to the CPU that the serialization callback function should + * run on. If it's not in the serial cpumask of @pinst + * (i.e. cpumask.cbcpu), this function selects a fallback CPU and if + * none found, returns -EINVAL. * * The parallelization callback function will run with BHs off. * Note: Every object which is parallelized by padata_do_parallel * must be seen by padata_do_serial. */ int padata_do_parallel(struct padata_instance *pinst, - struct padata_priv *padata, int cb_cpu) + struct padata_priv *padata, int *cb_cpu) { - int target_cpu, err; + int i, cpu, cpu_index, target_cpu, err; struct padata_parallel_queue *queue; struct parallel_data *pd; @@ -116,8 +118,19 @@ int padata_do_parallel(struct padata_instance *pinst, if (!(pinst->flags & PADATA_INIT) || pinst->flags & PADATA_INVALID) goto out; - if (!cpumask_test_cpu(cb_cpu, pd->cpumask.cbcpu)) - goto out; + if (!cpumask_test_cpu(*cb_cpu, pd->cpumask.cbcpu)) { + if (!cpumask_weight(pd->cpumask.cbcpu)) + goto out; + + /* Select an alternate fallback CPU and notify the caller. */ + cpu_index = *cb_cpu % cpumask_weight(pd->cpumask.cbcpu); + + cpu = cpumask_first(pd->cpumask.cbcpu); + for (i = 0; i < cpu_index; i++) + cpu = cpumask_next(cpu, pd->cpumask.cbcpu); + + *cb_cpu = cpu; + } err = -EBUSY; if ((pinst->flags & PADATA_RESET)) @@ -129,7 +142,7 @@ int padata_do_parallel(struct padata_instance *pinst, err = 0; atomic_inc(&pd->refcnt); padata->pd = pd; - padata->cb_cpu = cb_cpu; + padata->cb_cpu = *cb_cpu; target_cpu = padata_cpu_hash(pd); padata->cpu = target_cpu; -- cgit v1.2.3 From 45d153c08bc73c8ced640dc20d8f2b749a6cb0d0 Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Thu, 5 Sep 2019 21:40:27 -0400 Subject: padata: use separate workqueues for parallel and serial work padata currently uses one per-CPU workqueue per instance for all work. Prepare for running parallel jobs on an unbound workqueue by introducing dedicated workqueues for parallel and serial work. Signed-off-by: Daniel Jordan Acked-by: Steffen Klassert Cc: Herbert Xu Cc: Lai Jiangshan Cc: Peter Zijlstra Cc: Tejun Heo Cc: linux-crypto@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Herbert Xu --- include/linux/padata.h | 6 ++++-- kernel/padata.c | 28 ++++++++++++++++++---------- 2 files changed, 22 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/padata.h b/include/linux/padata.h index f7851f8e2190..e7978f8942ca 100644 --- a/include/linux/padata.h +++ b/include/linux/padata.h @@ -127,7 +127,8 @@ struct parallel_data { * struct padata_instance - The overall control structure. * * @cpu_notifier: cpu hotplug notifier. - * @wq: The workqueue in use. + * @parallel_wq: The workqueue used for parallel work. + * @serial_wq: The workqueue used for serial work. * @pd: The internal control structure. * @cpumask: User supplied cpumasks for parallel and serial works. * @cpumask_change_notifier: Notifiers chain for user-defined notify @@ -139,7 +140,8 @@ struct parallel_data { */ struct padata_instance { struct hlist_node node; - struct workqueue_struct *wq; + struct workqueue_struct *parallel_wq; + struct workqueue_struct *serial_wq; struct parallel_data *pd; struct padata_cpumask cpumask; struct blocking_notifier_head cpumask_change_notifier; diff --git a/kernel/padata.c b/kernel/padata.c index 8a362923c488..669f5d53d357 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -152,7 +152,7 @@ int padata_do_parallel(struct padata_instance *pinst, list_add_tail(&padata->list, &queue->parallel.list); spin_unlock(&queue->parallel.lock); - queue_work_on(target_cpu, pinst->wq, &queue->work); + queue_work_on(target_cpu, pinst->parallel_wq, &queue->work); out: rcu_read_unlock_bh(); @@ -261,7 +261,7 @@ static void padata_reorder(struct parallel_data *pd) list_add_tail(&padata->list, &squeue->serial.list); spin_unlock(&squeue->serial.lock); - queue_work_on(cb_cpu, pinst->wq, &squeue->work); + queue_work_on(cb_cpu, pinst->serial_wq, &squeue->work); } spin_unlock_bh(&pd->lock); @@ -278,7 +278,7 @@ static void padata_reorder(struct parallel_data *pd) next_queue = per_cpu_ptr(pd->pqueue, pd->cpu); if (!list_empty(&next_queue->reorder.list)) - queue_work(pinst->wq, &pd->reorder_work); + queue_work(pinst->serial_wq, &pd->reorder_work); } static void invoke_padata_reorder(struct work_struct *work) @@ -818,7 +818,8 @@ static void __padata_free(struct padata_instance *pinst) padata_free_pd(pinst->pd); free_cpumask_var(pinst->cpumask.pcpu); free_cpumask_var(pinst->cpumask.cbcpu); - destroy_workqueue(pinst->wq); + destroy_workqueue(pinst->serial_wq); + destroy_workqueue(pinst->parallel_wq); kfree(pinst); } @@ -967,18 +968,23 @@ static struct padata_instance *padata_alloc(const char *name, if (!pinst) goto err; - pinst->wq = alloc_workqueue("%s", WQ_MEM_RECLAIM | WQ_CPU_INTENSIVE, - 1, name); - if (!pinst->wq) + pinst->parallel_wq = alloc_workqueue("%s_parallel", WQ_MEM_RECLAIM | + WQ_CPU_INTENSIVE, 1, name); + if (!pinst->parallel_wq) goto err_free_inst; get_online_cpus(); - if (!alloc_cpumask_var(&pinst->cpumask.pcpu, GFP_KERNEL)) + pinst->serial_wq = alloc_workqueue("%s_serial", WQ_MEM_RECLAIM | + WQ_CPU_INTENSIVE, 1, name); + if (!pinst->serial_wq) goto err_put_cpus; + + if (!alloc_cpumask_var(&pinst->cpumask.pcpu, GFP_KERNEL)) + goto err_free_serial_wq; if (!alloc_cpumask_var(&pinst->cpumask.cbcpu, GFP_KERNEL)) { free_cpumask_var(pinst->cpumask.pcpu); - goto err_put_cpus; + goto err_free_serial_wq; } if (!padata_validate_cpumask(pinst, pcpumask) || !padata_validate_cpumask(pinst, cbcpumask)) @@ -1010,9 +1016,11 @@ static struct padata_instance *padata_alloc(const char *name, err_free_masks: free_cpumask_var(pinst->cpumask.pcpu); free_cpumask_var(pinst->cpumask.cbcpu); +err_free_serial_wq: + destroy_workqueue(pinst->serial_wq); err_put_cpus: put_online_cpus(); - destroy_workqueue(pinst->wq); + destroy_workqueue(pinst->parallel_wq); err_free_inst: kfree(pinst); err: -- cgit v1.2.3 From bfde23ce200e6d33291d29b9b8b60cc2f30f0805 Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Thu, 5 Sep 2019 21:40:28 -0400 Subject: padata: unbind parallel jobs from specific CPUs Padata binds the parallel part of a job to a single CPU and round-robins over all CPUs in the system for each successive job. Though the serial parts rely on per-CPU queues for correct ordering, they're not necessary for parallel work, and it improves performance to run the job locally on NUMA machines and let the scheduler pick the CPU within a node on a busy system. So, make the parallel workqueue unbound. Update the parallel workqueue's cpumask when the instance's parallel cpumask changes. Now that parallel jobs no longer run on max_active=1 workqueues, two or more parallel works that hash to the same CPU may run simultaneously, finish out of order, and so be serialized out of order. Prevent this by keeping the works sorted on the reorder list by sequence number and checking that in the reordering logic. padata_get_next becomes padata_find_next so it can be reused for the end of padata_reorder, where it's used to avoid uselessly queueing work when the next job by sequence number isn't finished yet but a later job that hashed to the same CPU has. The ENODATA case in padata_find_next no longer makes sense because parallel jobs aren't bound to specific CPUs. The EINPROGRESS case takes care of the scenario where a parallel job is potentially running on the same CPU as padata_find_next, and with only one error code left, just use NULL instead. Signed-off-by: Daniel Jordan Cc: Herbert Xu Cc: Lai Jiangshan Cc: Peter Zijlstra Cc: Steffen Klassert Cc: Tejun Heo Cc: linux-crypto@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Herbert Xu --- include/linux/padata.h | 3 ++ kernel/padata.c | 118 +++++++++++++++++++++++++++---------------------- 2 files changed, 68 insertions(+), 53 deletions(-) (limited to 'include/linux') diff --git a/include/linux/padata.h b/include/linux/padata.h index e7978f8942ca..43d3fd9d17fc 100644 --- a/include/linux/padata.h +++ b/include/linux/padata.h @@ -35,6 +35,7 @@ struct padata_priv { struct parallel_data *pd; int cb_cpu; int cpu; + unsigned int seq_nr; int info; void (*parallel)(struct padata_priv *padata); void (*serial)(struct padata_priv *padata); @@ -105,6 +106,7 @@ struct padata_cpumask { * @reorder_objects: Number of objects waiting in the reorder queues. * @refcnt: Number of objects holding a reference on this parallel_data. * @max_seq_nr: Maximal used sequence number. + * @processed: Number of already processed objects. * @cpu: Next CPU to be processed. * @cpumask: The cpumasks in use for parallel and serial workers. * @reorder_work: work struct for reordering. @@ -117,6 +119,7 @@ struct parallel_data { atomic_t reorder_objects; atomic_t refcnt; atomic_t seq_nr; + unsigned int processed; int cpu; struct padata_cpumask cpumask; struct work_struct reorder_work; diff --git a/kernel/padata.c b/kernel/padata.c index 669f5d53d357..832224dcf2e1 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -46,18 +46,13 @@ static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) return target_cpu; } -static int padata_cpu_hash(struct parallel_data *pd) +static int padata_cpu_hash(struct parallel_data *pd, unsigned int seq_nr) { - unsigned int seq_nr; - int cpu_index; - /* * Hash the sequence numbers to the cpus by taking * seq_nr mod. number of cpus in use. */ - - seq_nr = atomic_inc_return(&pd->seq_nr); - cpu_index = seq_nr % cpumask_weight(pd->cpumask.pcpu); + int cpu_index = seq_nr % cpumask_weight(pd->cpumask.pcpu); return padata_index_to_cpu(pd, cpu_index); } @@ -144,7 +139,8 @@ int padata_do_parallel(struct padata_instance *pinst, padata->pd = pd; padata->cb_cpu = *cb_cpu; - target_cpu = padata_cpu_hash(pd); + padata->seq_nr = atomic_inc_return(&pd->seq_nr); + target_cpu = padata_cpu_hash(pd, padata->seq_nr); padata->cpu = target_cpu; queue = per_cpu_ptr(pd->pqueue, target_cpu); @@ -152,7 +148,7 @@ int padata_do_parallel(struct padata_instance *pinst, list_add_tail(&padata->list, &queue->parallel.list); spin_unlock(&queue->parallel.lock); - queue_work_on(target_cpu, pinst->parallel_wq, &queue->work); + queue_work(pinst->parallel_wq, &queue->work); out: rcu_read_unlock_bh(); @@ -162,21 +158,19 @@ out: EXPORT_SYMBOL(padata_do_parallel); /* - * padata_get_next - Get the next object that needs serialization. + * padata_find_next - Find the next object that needs serialization. * * Return values are: * * A pointer to the control struct of the next object that needs * serialization, if present in one of the percpu reorder queues. * - * -EINPROGRESS, if the next object that needs serialization will + * NULL, if the next object that needs serialization will * be parallel processed by another cpu and is not yet present in * the cpu's reorder queue. - * - * -ENODATA, if this cpu has to do the parallel processing for - * the next object. */ -static struct padata_priv *padata_get_next(struct parallel_data *pd) +static struct padata_priv *padata_find_next(struct parallel_data *pd, + bool remove_object) { struct padata_parallel_queue *next_queue; struct padata_priv *padata; @@ -187,28 +181,30 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd) reorder = &next_queue->reorder; spin_lock(&reorder->lock); - if (!list_empty(&reorder->list)) { - padata = list_entry(reorder->list.next, - struct padata_priv, list); - - list_del_init(&padata->list); - atomic_dec(&pd->reorder_objects); + if (list_empty(&reorder->list)) { + spin_unlock(&reorder->lock); + return NULL; + } - pd->cpu = cpumask_next_wrap(cpu, pd->cpumask.pcpu, -1, - false); + padata = list_entry(reorder->list.next, struct padata_priv, list); + /* + * Checks the rare case where two or more parallel jobs have hashed to + * the same CPU and one of the later ones finishes first. + */ + if (padata->seq_nr != pd->processed) { spin_unlock(&reorder->lock); - goto out; + return NULL; } - spin_unlock(&reorder->lock); - if (__this_cpu_read(pd->pqueue->cpu_index) == next_queue->cpu_index) { - padata = ERR_PTR(-ENODATA); - goto out; + if (remove_object) { + list_del_init(&padata->list); + atomic_dec(&pd->reorder_objects); + ++pd->processed; + pd->cpu = cpumask_next_wrap(cpu, pd->cpumask.pcpu, -1, false); } - padata = ERR_PTR(-EINPROGRESS); -out: + spin_unlock(&reorder->lock); return padata; } @@ -234,26 +230,16 @@ static void padata_reorder(struct parallel_data *pd) return; while (1) { - padata = padata_get_next(pd); + padata = padata_find_next(pd, true); /* * If the next object that needs serialization is parallel * processed by another cpu and is still on it's way to the * cpu's reorder queue, nothing to do for now. */ - if (PTR_ERR(padata) == -EINPROGRESS) + if (!padata) break; - /* - * This cpu has to do the parallel processing of the next - * object. It's waiting in the cpu's parallelization queue, - * so exit immediately. - */ - if (PTR_ERR(padata) == -ENODATA) { - spin_unlock_bh(&pd->lock); - return; - } - cb_cpu = padata->cb_cpu; squeue = per_cpu_ptr(pd->squeue, cb_cpu); @@ -277,7 +263,8 @@ static void padata_reorder(struct parallel_data *pd) smp_mb(); next_queue = per_cpu_ptr(pd->pqueue, pd->cpu); - if (!list_empty(&next_queue->reorder.list)) + if (!list_empty(&next_queue->reorder.list) && + padata_find_next(pd, false)) queue_work(pinst->serial_wq, &pd->reorder_work); } @@ -332,9 +319,14 @@ void padata_do_serial(struct padata_priv *padata) struct parallel_data *pd = padata->pd; struct padata_parallel_queue *pqueue = per_cpu_ptr(pd->pqueue, padata->cpu); + struct padata_priv *cur; spin_lock(&pqueue->reorder.lock); - list_add_tail(&padata->list, &pqueue->reorder.list); + /* Sort in ascending order of sequence number. */ + list_for_each_entry_reverse(cur, &pqueue->reorder.list, list) + if (cur->seq_nr < padata->seq_nr) + break; + list_add(&padata->list, &cur->list); atomic_inc(&pd->reorder_objects); spin_unlock(&pqueue->reorder.lock); @@ -353,17 +345,36 @@ static int padata_setup_cpumasks(struct parallel_data *pd, const struct cpumask *pcpumask, const struct cpumask *cbcpumask) { - if (!alloc_cpumask_var(&pd->cpumask.pcpu, GFP_KERNEL)) - return -ENOMEM; + struct workqueue_attrs *attrs; + int err = -ENOMEM; + if (!alloc_cpumask_var(&pd->cpumask.pcpu, GFP_KERNEL)) + goto out; cpumask_and(pd->cpumask.pcpu, pcpumask, cpu_online_mask); - if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) { - free_cpumask_var(pd->cpumask.pcpu); - return -ENOMEM; - } + if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) + goto free_pcpu_mask; cpumask_and(pd->cpumask.cbcpu, cbcpumask, cpu_online_mask); + + attrs = alloc_workqueue_attrs(); + if (!attrs) + goto free_cbcpu_mask; + + /* Restrict parallel_wq workers to pd->cpumask.pcpu. */ + cpumask_copy(attrs->cpumask, pd->cpumask.pcpu); + err = apply_workqueue_attrs(pd->pinst->parallel_wq, attrs); + free_workqueue_attrs(attrs); + if (err < 0) + goto free_cbcpu_mask; + return 0; + +free_cbcpu_mask: + free_cpumask_var(pd->cpumask.cbcpu); +free_pcpu_mask: + free_cpumask_var(pd->cpumask.pcpu); +out: + return err; } static void __padata_list_init(struct padata_list *pd_list) @@ -429,6 +440,8 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst, pd->squeue = alloc_percpu(struct padata_serial_queue); if (!pd->squeue) goto err_free_pqueue; + + pd->pinst = pinst; if (padata_setup_cpumasks(pd, pcpumask, cbcpumask) < 0) goto err_free_squeue; @@ -437,7 +450,6 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst, atomic_set(&pd->seq_nr, -1); atomic_set(&pd->reorder_objects, 0); atomic_set(&pd->refcnt, 0); - pd->pinst = pinst; spin_lock_init(&pd->lock); pd->cpu = cpumask_first(pd->cpumask.pcpu); INIT_WORK(&pd->reorder_work, invoke_padata_reorder); @@ -968,8 +980,8 @@ static struct padata_instance *padata_alloc(const char *name, if (!pinst) goto err; - pinst->parallel_wq = alloc_workqueue("%s_parallel", WQ_MEM_RECLAIM | - WQ_CPU_INTENSIVE, 1, name); + pinst->parallel_wq = alloc_workqueue("%s_parallel", WQ_UNBOUND, 0, + name); if (!pinst->parallel_wq) goto err_free_inst; -- cgit v1.2.3 From c51636a3065491af521187724d14a822548bcfd7 Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Thu, 5 Sep 2019 21:40:29 -0400 Subject: padata: remove cpu_index from the parallel_queue With the removal of the ENODATA case from padata_get_next, the cpu_index field is no longer useful, so it can go away. Signed-off-by: Daniel Jordan Acked-by: Steffen Klassert Cc: Herbert Xu Cc: Lai Jiangshan Cc: Peter Zijlstra Cc: Tejun Heo Cc: linux-crypto@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Herbert Xu --- include/linux/padata.h | 2 -- kernel/padata.c | 13 ++----------- 2 files changed, 2 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/padata.h b/include/linux/padata.h index 43d3fd9d17fc..23717eeaad23 100644 --- a/include/linux/padata.h +++ b/include/linux/padata.h @@ -75,14 +75,12 @@ struct padata_serial_queue { * @swork: work struct for serialization. * @work: work struct for parallelization. * @num_obj: Number of objects that are processed by this cpu. - * @cpu_index: Index of the cpu. */ struct padata_parallel_queue { struct padata_list parallel; struct padata_list reorder; struct work_struct work; atomic_t num_obj; - int cpu_index; }; /** diff --git a/kernel/padata.c b/kernel/padata.c index 832224dcf2e1..c3fec1413295 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -400,21 +400,12 @@ static void padata_init_squeues(struct parallel_data *pd) /* Initialize all percpu queues used by parallel workers */ static void padata_init_pqueues(struct parallel_data *pd) { - int cpu_index, cpu; + int cpu; struct padata_parallel_queue *pqueue; - cpu_index = 0; - for_each_possible_cpu(cpu) { + for_each_cpu(cpu, pd->cpumask.pcpu) { pqueue = per_cpu_ptr(pd->pqueue, cpu); - if (!cpumask_test_cpu(cpu, pd->cpumask.pcpu)) { - pqueue->cpu_index = -1; - continue; - } - - pqueue->cpu_index = cpu_index; - cpu_index++; - __padata_list_init(&pqueue->reorder); __padata_list_init(&pqueue->parallel); INIT_WORK(&pqueue->work, padata_parallel_worker); -- cgit v1.2.3