author    | Michal Marek <mmarek@suse.cz> | 2011-03-09 16:15:44 +0100
committer | Michal Marek <mmarek@suse.cz> | 2011-03-09 16:15:44 +0100
commit    | 2d8ad8719591fa803b0d589ed057fa46f49b7155 (patch)
tree      | 4ae051577dad1161c91dafbf4207bb10a9dc91bb /lib
parent    | 9b4ce7bce5f30712fd926ab4599a803314a07719 (diff)
parent    | c56eb8fb6dccb83d9fe62fd4dc00c834de9bc470 (diff)
Merge commit 'v2.6.38-rc1' into kbuild/packaging
Diffstat (limited to 'lib')
85 files changed, 8913 insertions(+), 1419 deletions(-)
diff --git a/lib/Kconfig b/lib/Kconfig index 97b136ff117e..0ee67e08ad3e 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -7,6 +7,9 @@ config BINARY_PRINTF menu "Library routines" +config RAID6_PQ + tristate + config BITREVERSE tristate @@ -103,6 +106,8 @@ config LZO_COMPRESS config LZO_DECOMPRESS tristate +source "lib/xz/Kconfig" + # # These all provide a common interface (hence the apparent duplication with # ZLIB_INFLATE; DECOMPRESS_GZIP is just a wrapper.) @@ -117,6 +122,10 @@ config DECOMPRESS_BZIP2 config DECOMPRESS_LZMA tristate +config DECOMPRESS_XZ + select XZ_DEC + tristate + config DECOMPRESS_LZO select LZO_DECOMPRESS tristate @@ -160,6 +169,9 @@ config TEXTSEARCH_BM config TEXTSEARCH_FSM tristate +config BTREE + boolean + config HAS_IOMEM boolean depends on !NO_IOMEM @@ -178,9 +190,6 @@ config HAS_DMA config CHECK_SIGNATURE bool -config HAVE_LMB - boolean - config CPUMASK_OFFSTACK bool "Force CPU masks off stack" if DEBUG_PER_CPU_MAPS help @@ -207,4 +216,7 @@ config GENERIC_ATOMIC64 config LRU_CACHE tristate +config AVERAGE + bool + endmenu diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 25c3ed594c54..2d05adb98401 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -76,7 +76,6 @@ config UNUSED_SYMBOLS config DEBUG_FS bool "Debug Filesystem" - depends on SYSFS help debugfs is a virtual file system that kernel developers use to put debugging files into. Enable this option to be able to read and @@ -103,7 +102,8 @@ config HEADERS_CHECK config DEBUG_SECTION_MISMATCH bool "Enable full Section mismatch analysis" - depends on UNDEFINED + depends on UNDEFINED || (BLACKFIN) + default y # This option is on purpose disabled for now. # It will be enabled when we are down to a reasonable number # of section mismatch warnings (< 10 for an allyesconfig build) @@ -151,28 +151,34 @@ config DEBUG_SHIRQ Drivers ought to be able to handle interrupts coming in at those points; some don't and need to be caught. -config DETECT_SOFTLOCKUP - bool "Detect Soft Lockups" +config LOCKUP_DETECTOR + bool "Detect Hard and Soft Lockups" depends on DEBUG_KERNEL && !S390 - default y help - Say Y here to enable the kernel to detect "soft lockups", - which are bugs that cause the kernel to loop in kernel + Say Y here to enable the kernel to act as a watchdog to detect + hard and soft lockups. + + Softlockups are bugs that cause the kernel to loop in kernel mode for more than 60 seconds, without giving other tasks a - chance to run. + chance to run. The current stack trace is displayed upon + detection and the system will stay locked up. - When a soft-lockup is detected, the kernel will print the - current stack trace (which you should report), but the - system will stay locked up. This feature has negligible - overhead. + Hardlockups are bugs that cause the CPU to loop in kernel mode + for more than 60 seconds, without letting other interrupts have a + chance to run. The current stack trace is displayed upon detection + and the system will stay locked up. + + The overhead should be minimal. A periodic hrtimer runs to + generate interrupts and kick the watchdog task every 10-12 seconds. + An NMI is generated every 60 seconds or so to check for hardlockups. - (Note that "hard lockups" are separate type of bugs that - can be detected via the NMI-watchdog, on platforms that - support it.) 
+config HARDLOCKUP_DETECTOR + def_bool LOCKUP_DETECTOR && PERF_EVENTS && HAVE_PERF_EVENTS_NMI && \ + !ARCH_HAS_NMI_WATCHDOG config BOOTPARAM_SOFTLOCKUP_PANIC bool "Panic (Reboot) On Soft Lockups" - depends on DETECT_SOFTLOCKUP + depends on LOCKUP_DETECTOR help Say Y here to enable the kernel to panic on "soft lockups", which are bugs that cause the kernel to loop in kernel @@ -189,7 +195,7 @@ config BOOTPARAM_SOFTLOCKUP_PANIC config BOOTPARAM_SOFTLOCKUP_PANIC_VALUE int - depends on DETECT_SOFTLOCKUP + depends on LOCKUP_DETECTOR range 0 1 default 0 if !BOOTPARAM_SOFTLOCKUP_PANIC default 1 if BOOTPARAM_SOFTLOCKUP_PANIC @@ -306,6 +312,20 @@ config DEBUG_OBJECTS_WORK work queue routines to track the life time of work objects and validate the work operations. +config DEBUG_OBJECTS_RCU_HEAD + bool "Debug RCU callbacks objects" + depends on DEBUG_OBJECTS && PREEMPT + help + Enable this to turn on debugging of RCU list heads (call_rcu() usage). + +config DEBUG_OBJECTS_PERCPU_COUNTER + bool "Debug percpu counter objects" + depends on DEBUG_OBJECTS + help + If you say Y here, additional code will be inserted into the + percpu counter routines to track the life time of percpu counter + objects and validate the percpu counter operations. + config DEBUG_OBJECTS_ENABLE_DEFAULT int "debug_objects bootup default value (0-1)" range 0 1 @@ -342,7 +362,7 @@ config SLUB_DEBUG_ON config SLUB_STATS default n bool "Enable SLUB performance statistics" - depends on SLUB && SLUB_DEBUG && SYSFS + depends on SLUB && SYSFS help SLUB statistics are useful to debug SLUBs allocation behavior in order find ways to optimize the allocator. This should never be @@ -355,7 +375,7 @@ config SLUB_STATS config DEBUG_KMEMLEAK bool "Kernel memory leak detector" depends on DEBUG_KERNEL && EXPERIMENTAL && !MEMORY_HOTPLUG && \ - (X86 || ARM || PPC || S390) + (X86 || ARM || PPC || S390 || SPARC64 || SUPERH || MICROBLAZE || TILE) select DEBUG_FS if SYSFS select STACKTRACE if STACKTRACE_SUPPORT @@ -399,6 +419,13 @@ config DEBUG_KMEMLEAK_TEST If unsure, say N. +config DEBUG_KMEMLEAK_DEFAULT_OFF + bool "Default kmemleak to off" + depends on DEBUG_KMEMLEAK + help + Say Y here to disable kmemleak by default. It can then be enabled + on the command line via kmemleak=on. + config DEBUG_PREEMPT bool "Debug preemptible kernel" depends on DEBUG_KERNEL && PREEMPT && TRACE_IRQFLAGS_SUPPORT @@ -443,6 +470,15 @@ config DEBUG_MUTEXES This feature allows mutex semantics violations to be detected and reported. +config BKL + bool "Big Kernel Lock" if (SMP || PREEMPT) + default y + help + This is the traditional lock that is used in old code instead + of proper locking. All drivers that use the BKL should depend + on this symbol. + Say Y here unless you are working on removing the BKL. + config DEBUG_LOCK_ALLOC bool "Lock debugging: detect incorrect freeing of live locks" depends on DEBUG_KERNEL && TRACE_IRQFLAGS_SUPPORT && STACKTRACE_SUPPORT && LOCKDEP_SUPPORT @@ -464,6 +500,7 @@ config PROVE_LOCKING select DEBUG_SPINLOCK select DEBUG_MUTEXES select DEBUG_LOCK_ALLOC + select TRACE_IRQFLAGS default n help This feature enables the kernel to prove that all locking @@ -499,11 +536,52 @@ config PROVE_LOCKING For more details, see Documentation/lockdep-design.txt. +config PROVE_RCU + bool "RCU debugging: prove RCU correctness" + depends on PROVE_LOCKING + default n + help + This feature enables lockdep extensions that check for correct + use of RCU APIs. This is currently under development. 
Say Y + if you want to debug RCU usage or help work on the PROVE_RCU + feature. + + Say N if you are unsure. + +config PROVE_RCU_REPEATEDLY + bool "RCU debugging: don't disable PROVE_RCU on first splat" + depends on PROVE_RCU + default n + help + By itself, PROVE_RCU will disable checking upon issuing the + first warning (or "splat"). This feature prevents such + disabling, allowing multiple RCU-lockdep warnings to be printed + on a single reboot. + + Say Y to allow multiple RCU-lockdep warnings per boot. + + Say N if you are unsure. + +config SPARSE_RCU_POINTER + bool "RCU debugging: sparse-based checks for pointer usage" + default n + help + This feature enables the __rcu sparse annotation for + RCU-protected pointers. This annotation will cause sparse + to flag any non-RCU used of annotated pointers. This can be + helpful when debugging RCU usage. Please note that this feature + is not intended to enforce code cleanliness; it is instead merely + a debugging aid. + + Say Y to make sparse flag questionable use of RCU-protected pointers + + Say N if you are unsure. + config LOCKDEP bool depends on DEBUG_KERNEL && TRACE_IRQFLAGS_SUPPORT && STACKTRACE_SUPPORT && LOCKDEP_SUPPORT select STACKTRACE - select FRAME_POINTER if !MIPS && !PPC && !ARM_UNWIND && !S390 + select FRAME_POINTER if !MIPS && !PPC && !ARM_UNWIND && !S390 && !MICROBLAZE select KALLSYMS select KALLSYMS_ALL @@ -520,6 +598,14 @@ config LOCK_STAT For more details, see Documentation/lockstat.txt + This also enables lock events required by "perf lock", + subcommand of perf. + If you want to use "perf lock", you also need to turn on + CONFIG_EVENT_TRACING. + + CONFIG_LOCK_STAT defines "contended" and "acquired" lock events. + (CONFIG_LOCKDEP defines "acquire" and "release" events.) + config DEBUG_LOCKDEP bool "Lock dependency engine debugging" depends on DEBUG_KERNEL && LOCKDEP @@ -529,11 +615,10 @@ config DEBUG_LOCKDEP of more runtime overhead. config TRACE_IRQFLAGS - depends on DEBUG_KERNEL bool - default y - depends on TRACE_IRQFLAGS_SUPPORT - depends on PROVE_LOCKING + help + Enables hooks to interrupt enabling and disabling for + either tracing or lock debugging. config DEBUG_SPINLOCK_SLEEP bool "Spinlock debugging: sleep-inside-spinlock checking" @@ -595,6 +680,19 @@ config DEBUG_INFO If unsure, say N. +config DEBUG_INFO_REDUCED + bool "Reduce debugging information" + depends on DEBUG_INFO + help + If you say Y here gcc is instructed to generate less debugging + information for structure types. This means that tools that + need full debugging information (like kgdb or systemtap) won't + be happy. But if you merely need debugging information to + resolve line numbers there is no loss. Advantage is that + build directory object sizes shrink dramatically over a full + DEBUG_INFO build and compile times are reduced too. + Only works with newer gcc versions. + config DEBUG_VM bool "Debug VM" depends on DEBUG_KERNEL @@ -651,6 +749,15 @@ config DEBUG_LIST If unsure, say N. +config TEST_LIST_SORT + bool "Linked list sorting test" + depends on DEBUG_KERNEL + help + Enable this to turn on 'list_sort()' function test. This test is + executed only once during system boot, so affects only boot time. + + If unsure, say N. + config DEBUG_SG bool "Debug SG table operations" depends on DEBUG_KERNEL @@ -765,10 +872,46 @@ config RCU_CPU_STALL_DETECTOR CPUs are delaying the current grace period, but only when the grace period extends for excessive time periods. - Say Y if you want RCU to perform such checks. 
+ Say N if you want to disable such checks. + + Say Y if you are unsure. + +config RCU_CPU_STALL_TIMEOUT + int "RCU CPU stall timeout in seconds" + depends on RCU_CPU_STALL_DETECTOR + range 3 300 + default 60 + help + If a given RCU grace period extends more than the specified + number of seconds, a CPU stall warning is printed. If the + RCU grace period persists, additional CPU stall warnings are + printed at more widely spaced intervals. + +config RCU_CPU_STALL_DETECTOR_RUNNABLE + bool "RCU CPU stall checking starts automatically at boot" + depends on RCU_CPU_STALL_DETECTOR + default y + help + If set, start checking for RCU CPU stalls immediately on + boot. Otherwise, RCU CPU stall checking must be manually + enabled. + + Say Y if you are unsure. + + Say N if you wish to suppress RCU CPU stall checking during boot. + +config RCU_CPU_STALL_VERBOSE + bool "Print additional per-task information for RCU_CPU_STALL_DETECTOR" + depends on RCU_CPU_STALL_DETECTOR && TREE_PREEMPT_RCU + default y + help + This option causes RCU to printk detailed per-task information + for any tasks that are stalling the current RCU grace period. Say N if you are unsure. + Say Y if you want to enable such checks. + config KPROBES_SANITY_TEST bool "Kprobes sanity tests" depends on DEBUG_KERNEL @@ -840,8 +983,7 @@ config DEBUG_FORCE_WEAK_PER_CPU config LKDTM tristate "Linux Kernel Dump Test Tool Module" - depends on DEBUG_KERNEL - depends on KPROBES + depends on DEBUG_FS depends on BLOCK default n help @@ -852,7 +994,19 @@ config LKDTM called lkdtm. Documentation on how to use the module can be found in - drivers/misc/lkdtm.c + Documentation/fault-injection/provoke-crashes.txt + +config CPU_NOTIFIER_ERROR_INJECT + tristate "CPU notifier error injection module" + depends on HOTPLUG_CPU && DEBUG_KERNEL + help + This option provides a kernel module that can be used to test + the error handling of the cpu notifiers + + To compile this code as a module, choose M here: the module will + be called cpu-notifier-error-inject. + + If unsure, say N. config FAULT_INJECTION bool "Fault-injection framework" @@ -881,7 +1035,7 @@ config FAIL_MAKE_REQUEST Provide fault-injection capability for disk IO. config FAIL_IO_TIMEOUT - bool "Faul-injection capability for faking disk interrupts" + bool "Fault-injection capability for faking disk interrupts" depends on FAULT_INJECTION && BLOCK help Provide fault-injection capability on end IO handling. This @@ -902,19 +1056,22 @@ config FAULT_INJECTION_STACKTRACE_FILTER depends on FAULT_INJECTION_DEBUG_FS && STACKTRACE_SUPPORT depends on !X86_64 select STACKTRACE - select FRAME_POINTER if !PPC && !S390 + select FRAME_POINTER if !PPC && !S390 && !MICROBLAZE help Provide stacktrace filter for fault-injection capabilities config LATENCYTOP bool "Latency measuring infrastructure" - select FRAME_POINTER if !MIPS && !PPC && !S390 + depends on HAVE_LATENCYTOP_SUPPORT + depends on DEBUG_KERNEL + depends on STACKTRACE_SUPPORT + depends on PROC_FS + select FRAME_POINTER if !MIPS && !PPC && !S390 && !MICROBLAZE select KALLSYMS select KALLSYMS_ALL select STACKTRACE select SCHEDSTATS select SCHED_DEBUG - depends on HAVE_LATENCYTOP_SUPPORT help Enable this option if you want to use the LatencyTOP tool to find out which userspace is blocking on what kernel operations. @@ -995,10 +1152,10 @@ config DYNAMIC_DEBUG Usage: - Dynamic debugging is controlled via the 'dynamic_debug/ddebug' file, + Dynamic debugging is controlled via the 'dynamic_debug/control' file, which is contained in the 'debugfs' filesystem. 
Thus, the debugfs filesystem must first be mounted before making use of this feature. - We refer the control file as: <debugfs>/dynamic_debug/ddebug. This + We refer the control file as: <debugfs>/dynamic_debug/control. This file contains a list of the debug statements that can be enabled. The format for each line of the file is: @@ -1013,7 +1170,7 @@ config DYNAMIC_DEBUG From a live system: - nullarbor:~ # cat <debugfs>/dynamic_debug/ddebug + nullarbor:~ # cat <debugfs>/dynamic_debug/control # filename:lineno [module]function flags format fs/aio.c:222 [aio]__put_ioctx - "__put_ioctx:\040freeing\040%p\012" fs/aio.c:248 [aio]ioctx_alloc - "ENOMEM:\040nr_events\040too\040high\012" @@ -1023,23 +1180,23 @@ config DYNAMIC_DEBUG // enable the message at line 1603 of file svcsock.c nullarbor:~ # echo -n 'file svcsock.c line 1603 +p' > - <debugfs>/dynamic_debug/ddebug + <debugfs>/dynamic_debug/control // enable all the messages in file svcsock.c nullarbor:~ # echo -n 'file svcsock.c +p' > - <debugfs>/dynamic_debug/ddebug + <debugfs>/dynamic_debug/control // enable all the messages in the NFS server module nullarbor:~ # echo -n 'module nfsd +p' > - <debugfs>/dynamic_debug/ddebug + <debugfs>/dynamic_debug/control // enable all 12 messages in the function svc_process() nullarbor:~ # echo -n 'func svc_process +p' > - <debugfs>/dynamic_debug/ddebug + <debugfs>/dynamic_debug/control // disable all 12 messages in the function svc_process() nullarbor:~ # echo -n 'func svc_process -p' > - <debugfs>/dynamic_debug/ddebug + <debugfs>/dynamic_debug/control See Documentation/dynamic-debug-howto.txt for additional information. @@ -1054,6 +1211,26 @@ config DMA_API_DEBUG This option causes a performance degredation. Use only if you want to debug device drivers. If unsure, say N. +config ATOMIC64_SELFTEST + bool "Perform an atomic64_t self-test at boot" + help + Enable this option to test the atomic64_t functions at boot. + + If unsure, say N. + +config ASYNC_RAID6_TEST + tristate "Self test for hardware accelerated raid6 recovery" + depends on ASYNC_RAID6_RECOV + select ASYNC_MEMCPY + ---help--- + This is a one-shot self test that permutes through the + recovery of all the possible two disk failure scenarios for a + N-disk array. Recovery is performed with the asynchronous + raid6 recovery routines, and will optionally use an offload + engine if one is available. + + If unsure, say N. + source "samples/Kconfig" source "lib/Kconfig.kgdb" diff --git a/lib/Kconfig.kgdb b/lib/Kconfig.kgdb index 9b5d1d7f2ef7..43cb93fa2651 100644 --- a/lib/Kconfig.kgdb +++ b/lib/Kconfig.kgdb @@ -3,7 +3,7 @@ config HAVE_ARCH_KGDB bool menuconfig KGDB - bool "KGDB: kernel debugging with remote gdb" + bool "KGDB: kernel debugger" depends on HAVE_ARCH_KGDB depends on DEBUG_KERNEL && EXPERIMENTAL help @@ -57,4 +57,26 @@ config KGDB_TESTS_BOOT_STRING information about other strings you could use beyond the default of V1F100. +config KGDB_LOW_LEVEL_TRAP + bool "KGDB: Allow debugging with traps in notifiers" + depends on X86 || MIPS + default n + help + This will add an extra call back to kgdb for the breakpoint + exception handler on which will will allow kgdb to step + through a notify handler. 
+ +config KGDB_KDB + bool "KGDB_KDB: include kdb frontend for kgdb" + default n + help + KDB frontend for kernel + +config KDB_KEYBOARD + bool "KGDB_KDB: keyboard as input device" + depends on VT && KGDB_KDB + default n + help + KDB can use a PS/2 type keyboard for an input device + endif # KGDB diff --git a/lib/Makefile b/lib/Makefile index 3b0b4a696db9..cbb774f7d41d 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -8,11 +8,11 @@ KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS)) endif lib-y := ctype.o string.o vsprintf.o cmdline.o \ - rbtree.o radix-tree.o dump_stack.o \ + rbtree.o radix-tree.o dump_stack.o timerqueue.o\ idr.o int_sqrt.o extable.o prio_tree.o \ sha1.o irq_regs.o reciprocal_div.o argv_split.o \ proportions.o prio_heap.o ratelimit.o show_mem.o \ - is_single_threaded.o plist.o decompress.o flex_array.o + is_single_threaded.o plist.o decompress.o lib-$(CONFIG_MMU) += ioremap.o lib-$(CONFIG_SMP) += cpumask.o @@ -21,7 +21,7 @@ lib-y += kobject.o kref.o klist.o obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \ bust_spinlocks.o hexdump.o kasprintf.o bitmap.o scatterlist.o \ - string_helpers.o gcd.o list_sort.o + string_helpers.o gcd.o lcm.o list_sort.o uuid.o flex_array.o ifeq ($(CONFIG_DEBUG_KOBJECT),y) CFLAGS_kobject.o += -DDEBUG @@ -39,8 +39,12 @@ lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o lib-$(CONFIG_GENERIC_FIND_FIRST_BIT) += find_next_bit.o lib-$(CONFIG_GENERIC_FIND_NEXT_BIT) += find_next_bit.o obj-$(CONFIG_GENERIC_FIND_LAST_BIT) += find_last_bit.o + +CFLAGS_hweight.o = $(subst $(quote),,$(CONFIG_ARCH_HWEIGHT_CFLAGS)) obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o + obj-$(CONFIG_LOCK_KERNEL) += kernel_lock.o +obj-$(CONFIG_BTREE) += btree.o obj-$(CONFIG_DEBUG_PREEMPT) += smp_processor_id.o obj-$(CONFIG_DEBUG_LIST) += list_debug.o obj-$(CONFIG_DEBUG_OBJECTS) += debugobjects.o @@ -65,10 +69,13 @@ obj-$(CONFIG_ZLIB_DEFLATE) += zlib_deflate/ obj-$(CONFIG_REED_SOLOMON) += reed_solomon/ obj-$(CONFIG_LZO_COMPRESS) += lzo/ obj-$(CONFIG_LZO_DECOMPRESS) += lzo/ +obj-$(CONFIG_XZ_DEC) += xz/ +obj-$(CONFIG_RAID6_PQ) += raid6/ lib-$(CONFIG_DECOMPRESS_GZIP) += decompress_inflate.o lib-$(CONFIG_DECOMPRESS_BZIP2) += decompress_bunzip2.o lib-$(CONFIG_DECOMPRESS_LZMA) += decompress_unlzma.o +lib-$(CONFIG_DECOMPRESS_XZ) += decompress_unxz.o lib-$(CONFIG_DECOMPRESS_LZO) += decompress_unlzo.o obj-$(CONFIG_TEXTSEARCH) += textsearch.o @@ -81,11 +88,10 @@ obj-$(CONFIG_AUDIT_GENERIC) += audit.o obj-$(CONFIG_SWIOTLB) += swiotlb.o obj-$(CONFIG_IOMMU_HELPER) += iommu-helper.o obj-$(CONFIG_FAULT_INJECTION) += fault-inject.o +obj-$(CONFIG_CPU_NOTIFIER_ERROR_INJECT) += cpu-notifier-error-inject.o lib-$(CONFIG_GENERIC_BUG) += bug.o -obj-$(CONFIG_HAVE_LMB) += lmb.o - obj-$(CONFIG_HAVE_ARCH_TRACEHOOK) += syscall.o obj-$(CONFIG_DYNAMIC_DEBUG) += dynamic_debug.o @@ -100,6 +106,10 @@ obj-$(CONFIG_GENERIC_CSUM) += checksum.o obj-$(CONFIG_GENERIC_ATOMIC64) += atomic64.o +obj-$(CONFIG_ATOMIC64_SELFTEST) += atomic64_test.o + +obj-$(CONFIG_AVERAGE) += average.o + hostprogs-y := gen_crc32table clean-files := crc32table.h diff --git a/lib/atomic64.c b/lib/atomic64.c index 8bee16ec7524..a21c12bc727c 100644 --- a/lib/atomic64.c +++ b/lib/atomic64.c @@ -162,12 +162,12 @@ int atomic64_add_unless(atomic64_t *v, long long a, long long u) { unsigned long flags; spinlock_t *lock = lock_addr(v); - int ret = 1; + int ret = 0; spin_lock_irqsave(lock, flags); if (v->counter != u) { v->counter += a; - ret = 0; + ret = 1; } spin_unlock_irqrestore(lock, flags); return ret; diff --git a/lib/atomic64_test.c 
b/lib/atomic64_test.c new file mode 100644 index 000000000000..44524cc8c32a --- /dev/null +++ b/lib/atomic64_test.c @@ -0,0 +1,166 @@ +/* + * Testsuite for atomic64_t functions + * + * Copyright © 2010 Luca Barbieri + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ +#include <linux/init.h> +#include <linux/kernel.h> +#include <asm/atomic.h> + +#define INIT(c) do { atomic64_set(&v, c); r = c; } while (0) +static __init int test_atomic64(void) +{ + long long v0 = 0xaaa31337c001d00dLL; + long long v1 = 0xdeadbeefdeafcafeLL; + long long v2 = 0xfaceabadf00df001LL; + long long onestwos = 0x1111111122222222LL; + long long one = 1LL; + + atomic64_t v = ATOMIC64_INIT(v0); + long long r = v0; + BUG_ON(v.counter != r); + + atomic64_set(&v, v1); + r = v1; + BUG_ON(v.counter != r); + BUG_ON(atomic64_read(&v) != r); + + INIT(v0); + atomic64_add(onestwos, &v); + r += onestwos; + BUG_ON(v.counter != r); + + INIT(v0); + atomic64_add(-one, &v); + r += -one; + BUG_ON(v.counter != r); + + INIT(v0); + r += onestwos; + BUG_ON(atomic64_add_return(onestwos, &v) != r); + BUG_ON(v.counter != r); + + INIT(v0); + r += -one; + BUG_ON(atomic64_add_return(-one, &v) != r); + BUG_ON(v.counter != r); + + INIT(v0); + atomic64_sub(onestwos, &v); + r -= onestwos; + BUG_ON(v.counter != r); + + INIT(v0); + atomic64_sub(-one, &v); + r -= -one; + BUG_ON(v.counter != r); + + INIT(v0); + r -= onestwos; + BUG_ON(atomic64_sub_return(onestwos, &v) != r); + BUG_ON(v.counter != r); + + INIT(v0); + r -= -one; + BUG_ON(atomic64_sub_return(-one, &v) != r); + BUG_ON(v.counter != r); + + INIT(v0); + atomic64_inc(&v); + r += one; + BUG_ON(v.counter != r); + + INIT(v0); + r += one; + BUG_ON(atomic64_inc_return(&v) != r); + BUG_ON(v.counter != r); + + INIT(v0); + atomic64_dec(&v); + r -= one; + BUG_ON(v.counter != r); + + INIT(v0); + r -= one; + BUG_ON(atomic64_dec_return(&v) != r); + BUG_ON(v.counter != r); + + INIT(v0); + BUG_ON(atomic64_xchg(&v, v1) != v0); + r = v1; + BUG_ON(v.counter != r); + + INIT(v0); + BUG_ON(atomic64_cmpxchg(&v, v0, v1) != v0); + r = v1; + BUG_ON(v.counter != r); + + INIT(v0); + BUG_ON(atomic64_cmpxchg(&v, v2, v1) != v0); + BUG_ON(v.counter != r); + + INIT(v0); + BUG_ON(atomic64_add_unless(&v, one, v0)); + BUG_ON(v.counter != r); + + INIT(v0); + BUG_ON(!atomic64_add_unless(&v, one, v1)); + r += one; + BUG_ON(v.counter != r); + +#if defined(CONFIG_X86) || defined(CONFIG_MIPS) || defined(CONFIG_PPC) || \ + defined(CONFIG_S390) || defined(_ASM_GENERIC_ATOMIC64_H) || defined(CONFIG_ARM) + INIT(onestwos); + BUG_ON(atomic64_dec_if_positive(&v) != (onestwos - 1)); + r -= one; + BUG_ON(v.counter != r); + + INIT(0); + BUG_ON(atomic64_dec_if_positive(&v) != -one); + BUG_ON(v.counter != r); + + INIT(-one); + BUG_ON(atomic64_dec_if_positive(&v) != (-one - one)); + BUG_ON(v.counter != r); +#else +#warning Please implement atomic64_dec_if_positive for your architecture, and add it to the IF above +#endif + + INIT(onestwos); + BUG_ON(!atomic64_inc_not_zero(&v)); + r += one; + BUG_ON(v.counter != r); + + INIT(0); + BUG_ON(atomic64_inc_not_zero(&v)); + BUG_ON(v.counter != r); + + INIT(-one); + BUG_ON(!atomic64_inc_not_zero(&v)); + r += one; + BUG_ON(v.counter != r); + +#ifdef CONFIG_X86 + printk(KERN_INFO "atomic64 test passed for %s platform %s CX8 and %s SSE\n", +#ifdef CONFIG_X86_64 + "x86-64", +#elif 
defined(CONFIG_X86_CMPXCHG64) + "i586+", +#else + "i386+", +#endif + boot_cpu_has(X86_FEATURE_CX8) ? "with" : "without", + boot_cpu_has(X86_FEATURE_XMM) ? "with" : "without"); +#else + printk(KERN_INFO "atomic64 test passed\n"); +#endif + + return 0; +} + +core_initcall(test_atomic64); diff --git a/lib/average.c b/lib/average.c new file mode 100644 index 000000000000..5576c2841496 --- /dev/null +++ b/lib/average.c @@ -0,0 +1,61 @@ +/* + * lib/average.c + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + +#include <linux/module.h> +#include <linux/average.h> +#include <linux/bug.h> +#include <linux/log2.h> + +/** + * DOC: Exponentially Weighted Moving Average (EWMA) + * + * These are generic functions for calculating Exponentially Weighted Moving + * Averages (EWMA). We keep a structure with the EWMA parameters and a scaled + * up internal representation of the average value to prevent rounding errors. + * The factor for scaling up and the exponential weight (or decay rate) have to + * be specified thru the init fuction. The structure should not be accessed + * directly but only thru the helper functions. + */ + +/** + * ewma_init() - Initialize EWMA parameters + * @avg: Average structure + * @factor: Factor to use for the scaled up internal value. The maximum value + * of averages can be ULONG_MAX/(factor*weight). For performance reasons + * factor has to be a power of 2. + * @weight: Exponential weight, or decay rate. This defines how fast the + * influence of older values decreases. For performance reasons weight has + * to be a power of 2. + * + * Initialize the EWMA parameters for a given struct ewma @avg. + */ +void ewma_init(struct ewma *avg, unsigned long factor, unsigned long weight) +{ + WARN_ON(!is_power_of_2(weight) || !is_power_of_2(factor)); + + avg->weight = ilog2(weight); + avg->factor = ilog2(factor); + avg->internal = 0; +} +EXPORT_SYMBOL(ewma_init); + +/** + * ewma_add() - Exponentially weighted moving average (EWMA) + * @avg: Average structure + * @val: Current value + * + * Add a sample to the average. + */ +struct ewma *ewma_add(struct ewma *avg, unsigned long val) +{ + avg->internal = avg->internal ? + (((avg->internal << avg->weight) - avg->internal) + + (val << avg->factor)) >> avg->weight : + (val << avg->factor); + return avg; +} +EXPORT_SYMBOL(ewma_add); diff --git a/lib/bitmap.c b/lib/bitmap.c index 11bf49750583..741fae905ae3 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -359,7 +359,6 @@ EXPORT_SYMBOL(bitmap_find_next_zero_area); #define CHUNKSZ 32 #define nbits_to_hold_value(val) fls(val) -#define unhex(c) (isdigit(c) ? (c - '0') : (toupper(c) - 'A' + 10)) #define BASEDEC 10 /* fancier cpuset lists input in decimal */ /** @@ -466,7 +465,7 @@ int __bitmap_parse(const char *buf, unsigned int buflen, if (chunk & ~((1UL << (CHUNKSZ - 4)) - 1)) return -EOVERFLOW; - chunk = (chunk << 4) | unhex(c); + chunk = (chunk << 4) | hex_to_bin(c); ndigits++; totaldigits++; } if (ndigits == 0) @@ -487,7 +486,7 @@ int __bitmap_parse(const char *buf, unsigned int buflen, EXPORT_SYMBOL(__bitmap_parse); /** - * bitmap_parse_user() + * bitmap_parse_user - convert an ASCII hex string in a user buffer into a bitmap * * @ubuf: pointer to user buffer containing string. * @ulen: buffer size in bytes. 
If string is smaller than this @@ -619,7 +618,7 @@ int bitmap_parselist(const char *bp, unsigned long *maskp, int nmaskbits) EXPORT_SYMBOL(bitmap_parselist); /** - * bitmap_pos_to_ord(buf, pos, bits) + * bitmap_pos_to_ord - find ordinal of set bit at given position in bitmap * @buf: pointer to a bitmap * @pos: a bit position in @buf (0 <= @pos < @bits) * @bits: number of valid bit positions in @buf @@ -655,7 +654,7 @@ static int bitmap_pos_to_ord(const unsigned long *buf, int pos, int bits) } /** - * bitmap_ord_to_pos(buf, ord, bits) + * bitmap_ord_to_pos - find position of n-th set bit in bitmap * @buf: pointer to bitmap * @ord: ordinal bit position (n-th set bit, n >= 0) * @bits: number of valid bit positions in @buf @@ -733,10 +732,9 @@ void bitmap_remap(unsigned long *dst, const unsigned long *src, bitmap_zero(dst, bits); w = bitmap_weight(new, bits); - for (oldbit = find_first_bit(src, bits); - oldbit < bits; - oldbit = find_next_bit(src, bits, oldbit + 1)) { + for_each_set_bit(oldbit, src, bits) { int n = bitmap_pos_to_ord(old, oldbit, bits); + if (n < 0 || w == 0) set_bit(oldbit, dst); /* identity map */ else @@ -903,9 +901,7 @@ void bitmap_onto(unsigned long *dst, const unsigned long *orig, */ m = 0; - for (n = find_first_bit(relmap, bits); - n < bits; - n = find_next_bit(relmap, bits, n + 1)) { + for_each_set_bit(n, relmap, bits) { /* m == bitmap_pos_to_ord(relmap, n, bits) */ if (test_bit(m, orig)) set_bit(n, dst); @@ -934,9 +930,7 @@ void bitmap_fold(unsigned long *dst, const unsigned long *orig, return; bitmap_zero(dst, bits); - for (oldbit = find_first_bit(orig, bits); - oldbit < bits; - oldbit = find_next_bit(orig, bits, oldbit + 1)) + for_each_set_bit(oldbit, orig, bits) set_bit(oldbit % sz, dst); } EXPORT_SYMBOL(bitmap_fold); diff --git a/lib/btree.c b/lib/btree.c new file mode 100644 index 000000000000..c9c6f0351526 --- /dev/null +++ b/lib/btree.c @@ -0,0 +1,798 @@ +/* + * lib/btree.c - Simple In-memory B+Tree + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2007-2008 Joern Engel <joern@logfs.org> + * Bits and pieces stolen from Peter Zijlstra's code, which is + * Copyright 2007, Red Hat Inc. Peter Zijlstra <pzijlstr@redhat.com> + * GPLv2 + * + * see http://programming.kicks-ass.net/kernel-patches/vma_lookup/btree.patch + * + * A relatively simple B+Tree implementation. I have written it as a learning + * excercise to understand how B+Trees work. Turned out to be useful as well. + * + * B+Trees can be used similar to Linux radix trees (which don't have anything + * in common with textbook radix trees, beware). Prerequisite for them working + * well is that access to a random tree node is much faster than a large number + * of operations within each node. + * + * Disks have fulfilled the prerequisite for a long time. More recently DRAM + * has gained similar properties, as memory access times, when measured in cpu + * cycles, have increased. Cacheline sizes have increased as well, which also + * helps B+Trees. + * + * Compared to radix trees, B+Trees are more efficient when dealing with a + * sparsely populated address space. Between 25% and 50% of the memory is + * occupied with valid pointers. When densely populated, radix trees contain + * ~98% pointers - hard to beat. Very sparse radix trees contain only ~2% + * pointers. + * + * This particular implementation stores pointers identified by a long value. + * Storing NULL pointers is illegal, lookup will return NULL when no entry + * was found. 
+ * + * A tricks was used that is not commonly found in textbooks. The lowest + * values are to the right, not to the left. All used slots within a node + * are on the left, all unused slots contain NUL values. Most operations + * simply loop once over all slots and terminate on the first NUL. + */ + +#include <linux/btree.h> +#include <linux/cache.h> +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/module.h> + +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define NODESIZE MAX(L1_CACHE_BYTES, 128) + +struct btree_geo { + int keylen; + int no_pairs; + int no_longs; +}; + +struct btree_geo btree_geo32 = { + .keylen = 1, + .no_pairs = NODESIZE / sizeof(long) / 2, + .no_longs = NODESIZE / sizeof(long) / 2, +}; +EXPORT_SYMBOL_GPL(btree_geo32); + +#define LONG_PER_U64 (64 / BITS_PER_LONG) +struct btree_geo btree_geo64 = { + .keylen = LONG_PER_U64, + .no_pairs = NODESIZE / sizeof(long) / (1 + LONG_PER_U64), + .no_longs = LONG_PER_U64 * (NODESIZE / sizeof(long) / (1 + LONG_PER_U64)), +}; +EXPORT_SYMBOL_GPL(btree_geo64); + +struct btree_geo btree_geo128 = { + .keylen = 2 * LONG_PER_U64, + .no_pairs = NODESIZE / sizeof(long) / (1 + 2 * LONG_PER_U64), + .no_longs = 2 * LONG_PER_U64 * (NODESIZE / sizeof(long) / (1 + 2 * LONG_PER_U64)), +}; +EXPORT_SYMBOL_GPL(btree_geo128); + +static struct kmem_cache *btree_cachep; + +void *btree_alloc(gfp_t gfp_mask, void *pool_data) +{ + return kmem_cache_alloc(btree_cachep, gfp_mask); +} +EXPORT_SYMBOL_GPL(btree_alloc); + +void btree_free(void *element, void *pool_data) +{ + kmem_cache_free(btree_cachep, element); +} +EXPORT_SYMBOL_GPL(btree_free); + +static unsigned long *btree_node_alloc(struct btree_head *head, gfp_t gfp) +{ + unsigned long *node; + + node = mempool_alloc(head->mempool, gfp); + if (likely(node)) + memset(node, 0, NODESIZE); + return node; +} + +static int longcmp(const unsigned long *l1, const unsigned long *l2, size_t n) +{ + size_t i; + + for (i = 0; i < n; i++) { + if (l1[i] < l2[i]) + return -1; + if (l1[i] > l2[i]) + return 1; + } + return 0; +} + +static unsigned long *longcpy(unsigned long *dest, const unsigned long *src, + size_t n) +{ + size_t i; + + for (i = 0; i < n; i++) + dest[i] = src[i]; + return dest; +} + +static unsigned long *longset(unsigned long *s, unsigned long c, size_t n) +{ + size_t i; + + for (i = 0; i < n; i++) + s[i] = c; + return s; +} + +static void dec_key(struct btree_geo *geo, unsigned long *key) +{ + unsigned long val; + int i; + + for (i = geo->keylen - 1; i >= 0; i--) { + val = key[i]; + key[i] = val - 1; + if (val) + break; + } +} + +static unsigned long *bkey(struct btree_geo *geo, unsigned long *node, int n) +{ + return &node[n * geo->keylen]; +} + +static void *bval(struct btree_geo *geo, unsigned long *node, int n) +{ + return (void *)node[geo->no_longs + n]; +} + +static void setkey(struct btree_geo *geo, unsigned long *node, int n, + unsigned long *key) +{ + longcpy(bkey(geo, node, n), key, geo->keylen); +} + +static void setval(struct btree_geo *geo, unsigned long *node, int n, + void *val) +{ + node[geo->no_longs + n] = (unsigned long) val; +} + +static void clearpair(struct btree_geo *geo, unsigned long *node, int n) +{ + longset(bkey(geo, node, n), 0, geo->keylen); + node[geo->no_longs + n] = 0; +} + +static inline void __btree_init(struct btree_head *head) +{ + head->node = NULL; + head->height = 0; +} + +void btree_init_mempool(struct btree_head *head, mempool_t *mempool) +{ + __btree_init(head); + head->mempool = mempool; +} +EXPORT_SYMBOL_GPL(btree_init_mempool); + +int 
btree_init(struct btree_head *head) +{ + __btree_init(head); + head->mempool = mempool_create(0, btree_alloc, btree_free, NULL); + if (!head->mempool) + return -ENOMEM; + return 0; +} +EXPORT_SYMBOL_GPL(btree_init); + +void btree_destroy(struct btree_head *head) +{ + mempool_destroy(head->mempool); + head->mempool = NULL; +} +EXPORT_SYMBOL_GPL(btree_destroy); + +void *btree_last(struct btree_head *head, struct btree_geo *geo, + unsigned long *key) +{ + int height = head->height; + unsigned long *node = head->node; + + if (height == 0) + return NULL; + + for ( ; height > 1; height--) + node = bval(geo, node, 0); + + longcpy(key, bkey(geo, node, 0), geo->keylen); + return bval(geo, node, 0); +} +EXPORT_SYMBOL_GPL(btree_last); + +static int keycmp(struct btree_geo *geo, unsigned long *node, int pos, + unsigned long *key) +{ + return longcmp(bkey(geo, node, pos), key, geo->keylen); +} + +static int keyzero(struct btree_geo *geo, unsigned long *key) +{ + int i; + + for (i = 0; i < geo->keylen; i++) + if (key[i]) + return 0; + + return 1; +} + +void *btree_lookup(struct btree_head *head, struct btree_geo *geo, + unsigned long *key) +{ + int i, height = head->height; + unsigned long *node = head->node; + + if (height == 0) + return NULL; + + for ( ; height > 1; height--) { + for (i = 0; i < geo->no_pairs; i++) + if (keycmp(geo, node, i, key) <= 0) + break; + if (i == geo->no_pairs) + return NULL; + node = bval(geo, node, i); + if (!node) + return NULL; + } + + if (!node) + return NULL; + + for (i = 0; i < geo->no_pairs; i++) + if (keycmp(geo, node, i, key) == 0) + return bval(geo, node, i); + return NULL; +} +EXPORT_SYMBOL_GPL(btree_lookup); + +int btree_update(struct btree_head *head, struct btree_geo *geo, + unsigned long *key, void *val) +{ + int i, height = head->height; + unsigned long *node = head->node; + + if (height == 0) + return -ENOENT; + + for ( ; height > 1; height--) { + for (i = 0; i < geo->no_pairs; i++) + if (keycmp(geo, node, i, key) <= 0) + break; + if (i == geo->no_pairs) + return -ENOENT; + node = bval(geo, node, i); + if (!node) + return -ENOENT; + } + + if (!node) + return -ENOENT; + + for (i = 0; i < geo->no_pairs; i++) + if (keycmp(geo, node, i, key) == 0) { + setval(geo, node, i, val); + return 0; + } + return -ENOENT; +} +EXPORT_SYMBOL_GPL(btree_update); + +/* + * Usually this function is quite similar to normal lookup. But the key of + * a parent node may be smaller than the smallest key of all its siblings. + * In such a case we cannot just return NULL, as we have only proven that no + * key smaller than __key, but larger than this parent key exists. + * So we set __key to the parent key and retry. We have to use the smallest + * such parent key, which is the last parent key we encountered. 
+ */ +void *btree_get_prev(struct btree_head *head, struct btree_geo *geo, + unsigned long *__key) +{ + int i, height; + unsigned long *node, *oldnode; + unsigned long *retry_key = NULL, key[geo->keylen]; + + if (keyzero(geo, __key)) + return NULL; + + if (head->height == 0) + return NULL; +retry: + longcpy(key, __key, geo->keylen); + dec_key(geo, key); + + node = head->node; + for (height = head->height ; height > 1; height--) { + for (i = 0; i < geo->no_pairs; i++) + if (keycmp(geo, node, i, key) <= 0) + break; + if (i == geo->no_pairs) + goto miss; + oldnode = node; + node = bval(geo, node, i); + if (!node) + goto miss; + retry_key = bkey(geo, oldnode, i); + } + + if (!node) + goto miss; + + for (i = 0; i < geo->no_pairs; i++) { + if (keycmp(geo, node, i, key) <= 0) { + if (bval(geo, node, i)) { + longcpy(__key, bkey(geo, node, i), geo->keylen); + return bval(geo, node, i); + } else + goto miss; + } + } +miss: + if (retry_key) { + __key = retry_key; + retry_key = NULL; + goto retry; + } + return NULL; +} + +static int getpos(struct btree_geo *geo, unsigned long *node, + unsigned long *key) +{ + int i; + + for (i = 0; i < geo->no_pairs; i++) { + if (keycmp(geo, node, i, key) <= 0) + break; + } + return i; +} + +static int getfill(struct btree_geo *geo, unsigned long *node, int start) +{ + int i; + + for (i = start; i < geo->no_pairs; i++) + if (!bval(geo, node, i)) + break; + return i; +} + +/* + * locate the correct leaf node in the btree + */ +static unsigned long *find_level(struct btree_head *head, struct btree_geo *geo, + unsigned long *key, int level) +{ + unsigned long *node = head->node; + int i, height; + + for (height = head->height; height > level; height--) { + for (i = 0; i < geo->no_pairs; i++) + if (keycmp(geo, node, i, key) <= 0) + break; + + if ((i == geo->no_pairs) || !bval(geo, node, i)) { + /* right-most key is too large, update it */ + /* FIXME: If the right-most key on higher levels is + * always zero, this wouldn't be necessary. 
*/ + i--; + setkey(geo, node, i, key); + } + BUG_ON(i < 0); + node = bval(geo, node, i); + } + BUG_ON(!node); + return node; +} + +static int btree_grow(struct btree_head *head, struct btree_geo *geo, + gfp_t gfp) +{ + unsigned long *node; + int fill; + + node = btree_node_alloc(head, gfp); + if (!node) + return -ENOMEM; + if (head->node) { + fill = getfill(geo, head->node, 0); + setkey(geo, node, 0, bkey(geo, head->node, fill - 1)); + setval(geo, node, 0, head->node); + } + head->node = node; + head->height++; + return 0; +} + +static void btree_shrink(struct btree_head *head, struct btree_geo *geo) +{ + unsigned long *node; + int fill; + + if (head->height <= 1) + return; + + node = head->node; + fill = getfill(geo, node, 0); + BUG_ON(fill > 1); + head->node = bval(geo, node, 0); + head->height--; + mempool_free(node, head->mempool); +} + +static int btree_insert_level(struct btree_head *head, struct btree_geo *geo, + unsigned long *key, void *val, int level, + gfp_t gfp) +{ + unsigned long *node; + int i, pos, fill, err; + + BUG_ON(!val); + if (head->height < level) { + err = btree_grow(head, geo, gfp); + if (err) + return err; + } + +retry: + node = find_level(head, geo, key, level); + pos = getpos(geo, node, key); + fill = getfill(geo, node, pos); + /* two identical keys are not allowed */ + BUG_ON(pos < fill && keycmp(geo, node, pos, key) == 0); + + if (fill == geo->no_pairs) { + /* need to split node */ + unsigned long *new; + + new = btree_node_alloc(head, gfp); + if (!new) + return -ENOMEM; + err = btree_insert_level(head, geo, + bkey(geo, node, fill / 2 - 1), + new, level + 1, gfp); + if (err) { + mempool_free(new, head->mempool); + return err; + } + for (i = 0; i < fill / 2; i++) { + setkey(geo, new, i, bkey(geo, node, i)); + setval(geo, new, i, bval(geo, node, i)); + setkey(geo, node, i, bkey(geo, node, i + fill / 2)); + setval(geo, node, i, bval(geo, node, i + fill / 2)); + clearpair(geo, node, i + fill / 2); + } + if (fill & 1) { + setkey(geo, node, i, bkey(geo, node, fill - 1)); + setval(geo, node, i, bval(geo, node, fill - 1)); + clearpair(geo, node, fill - 1); + } + goto retry; + } + BUG_ON(fill >= geo->no_pairs); + + /* shift and insert */ + for (i = fill; i > pos; i--) { + setkey(geo, node, i, bkey(geo, node, i - 1)); + setval(geo, node, i, bval(geo, node, i - 1)); + } + setkey(geo, node, pos, key); + setval(geo, node, pos, val); + + return 0; +} + +int btree_insert(struct btree_head *head, struct btree_geo *geo, + unsigned long *key, void *val, gfp_t gfp) +{ + return btree_insert_level(head, geo, key, val, 1, gfp); +} +EXPORT_SYMBOL_GPL(btree_insert); + +static void *btree_remove_level(struct btree_head *head, struct btree_geo *geo, + unsigned long *key, int level); +static void merge(struct btree_head *head, struct btree_geo *geo, int level, + unsigned long *left, int lfill, + unsigned long *right, int rfill, + unsigned long *parent, int lpos) +{ + int i; + + for (i = 0; i < rfill; i++) { + /* Move all keys to the left */ + setkey(geo, left, lfill + i, bkey(geo, right, i)); + setval(geo, left, lfill + i, bval(geo, right, i)); + } + /* Exchange left and right child in parent */ + setval(geo, parent, lpos, right); + setval(geo, parent, lpos + 1, left); + /* Remove left (formerly right) child from parent */ + btree_remove_level(head, geo, bkey(geo, parent, lpos), level + 1); + mempool_free(right, head->mempool); +} + +static void rebalance(struct btree_head *head, struct btree_geo *geo, + unsigned long *key, int level, unsigned long *child, int fill) +{ + unsigned long 
*parent, *left = NULL, *right = NULL; + int i, no_left, no_right; + + if (fill == 0) { + /* Because we don't steal entries from a neigbour, this case + * can happen. Parent node contains a single child, this + * node, so merging with a sibling never happens. + */ + btree_remove_level(head, geo, key, level + 1); + mempool_free(child, head->mempool); + return; + } + + parent = find_level(head, geo, key, level + 1); + i = getpos(geo, parent, key); + BUG_ON(bval(geo, parent, i) != child); + + if (i > 0) { + left = bval(geo, parent, i - 1); + no_left = getfill(geo, left, 0); + if (fill + no_left <= geo->no_pairs) { + merge(head, geo, level, + left, no_left, + child, fill, + parent, i - 1); + return; + } + } + if (i + 1 < getfill(geo, parent, i)) { + right = bval(geo, parent, i + 1); + no_right = getfill(geo, right, 0); + if (fill + no_right <= geo->no_pairs) { + merge(head, geo, level, + child, fill, + right, no_right, + parent, i); + return; + } + } + /* + * We could also try to steal one entry from the left or right + * neighbor. By not doing so we changed the invariant from + * "all nodes are at least half full" to "no two neighboring + * nodes can be merged". Which means that the average fill of + * all nodes is still half or better. + */ +} + +static void *btree_remove_level(struct btree_head *head, struct btree_geo *geo, + unsigned long *key, int level) +{ + unsigned long *node; + int i, pos, fill; + void *ret; + + if (level > head->height) { + /* we recursed all the way up */ + head->height = 0; + head->node = NULL; + return NULL; + } + + node = find_level(head, geo, key, level); + pos = getpos(geo, node, key); + fill = getfill(geo, node, pos); + if ((level == 1) && (keycmp(geo, node, pos, key) != 0)) + return NULL; + ret = bval(geo, node, pos); + + /* remove and shift */ + for (i = pos; i < fill - 1; i++) { + setkey(geo, node, i, bkey(geo, node, i + 1)); + setval(geo, node, i, bval(geo, node, i + 1)); + } + clearpair(geo, node, fill - 1); + + if (fill - 1 < geo->no_pairs / 2) { + if (level < head->height) + rebalance(head, geo, key, level, node, fill - 1); + else if (fill - 1 == 1) + btree_shrink(head, geo); + } + + return ret; +} + +void *btree_remove(struct btree_head *head, struct btree_geo *geo, + unsigned long *key) +{ + if (head->height == 0) + return NULL; + + return btree_remove_level(head, geo, key, 1); +} +EXPORT_SYMBOL_GPL(btree_remove); + +int btree_merge(struct btree_head *target, struct btree_head *victim, + struct btree_geo *geo, gfp_t gfp) +{ + unsigned long key[geo->keylen]; + unsigned long dup[geo->keylen]; + void *val; + int err; + + BUG_ON(target == victim); + + if (!(target->node)) { + /* target is empty, just copy fields over */ + target->node = victim->node; + target->height = victim->height; + __btree_init(victim); + return 0; + } + + /* TODO: This needs some optimizations. Currently we do three tree + * walks to remove a single object from the victim. + */ + for (;;) { + if (!btree_last(victim, geo, key)) + break; + val = btree_lookup(victim, geo, key); + err = btree_insert(target, geo, key, val, gfp); + if (err) + return err; + /* We must make a copy of the key, as the original will get + * mangled inside btree_remove. 
*/ + longcpy(dup, key, geo->keylen); + btree_remove(victim, geo, dup); + } + return 0; +} +EXPORT_SYMBOL_GPL(btree_merge); + +static size_t __btree_for_each(struct btree_head *head, struct btree_geo *geo, + unsigned long *node, unsigned long opaque, + void (*func)(void *elem, unsigned long opaque, + unsigned long *key, size_t index, + void *func2), + void *func2, int reap, int height, size_t count) +{ + int i; + unsigned long *child; + + for (i = 0; i < geo->no_pairs; i++) { + child = bval(geo, node, i); + if (!child) + break; + if (height > 1) + count = __btree_for_each(head, geo, child, opaque, + func, func2, reap, height - 1, count); + else + func(child, opaque, bkey(geo, node, i), count++, + func2); + } + if (reap) + mempool_free(node, head->mempool); + return count; +} + +static void empty(void *elem, unsigned long opaque, unsigned long *key, + size_t index, void *func2) +{ +} + +void visitorl(void *elem, unsigned long opaque, unsigned long *key, + size_t index, void *__func) +{ + visitorl_t func = __func; + + func(elem, opaque, *key, index); +} +EXPORT_SYMBOL_GPL(visitorl); + +void visitor32(void *elem, unsigned long opaque, unsigned long *__key, + size_t index, void *__func) +{ + visitor32_t func = __func; + u32 *key = (void *)__key; + + func(elem, opaque, *key, index); +} +EXPORT_SYMBOL_GPL(visitor32); + +void visitor64(void *elem, unsigned long opaque, unsigned long *__key, + size_t index, void *__func) +{ + visitor64_t func = __func; + u64 *key = (void *)__key; + + func(elem, opaque, *key, index); +} +EXPORT_SYMBOL_GPL(visitor64); + +void visitor128(void *elem, unsigned long opaque, unsigned long *__key, + size_t index, void *__func) +{ + visitor128_t func = __func; + u64 *key = (void *)__key; + + func(elem, opaque, key[0], key[1], index); +} +EXPORT_SYMBOL_GPL(visitor128); + +size_t btree_visitor(struct btree_head *head, struct btree_geo *geo, + unsigned long opaque, + void (*func)(void *elem, unsigned long opaque, + unsigned long *key, + size_t index, void *func2), + void *func2) +{ + size_t count = 0; + + if (!func2) + func = empty; + if (head->node) + count = __btree_for_each(head, geo, head->node, opaque, func, + func2, 0, head->height, 0); + return count; +} +EXPORT_SYMBOL_GPL(btree_visitor); + +size_t btree_grim_visitor(struct btree_head *head, struct btree_geo *geo, + unsigned long opaque, + void (*func)(void *elem, unsigned long opaque, + unsigned long *key, + size_t index, void *func2), + void *func2) +{ + size_t count = 0; + + if (!func2) + func = empty; + if (head->node) + count = __btree_for_each(head, geo, head->node, opaque, func, + func2, 1, head->height, 0); + __btree_init(head); + return count; +} +EXPORT_SYMBOL_GPL(btree_grim_visitor); + +static int __init btree_module_init(void) +{ + btree_cachep = kmem_cache_create("btree_node", NODESIZE, 0, + SLAB_HWCACHE_ALIGN, NULL); + return 0; +} + +static void __exit btree_module_exit(void) +{ + kmem_cache_destroy(btree_cachep); +} + +/* If core code starts using btree, initialization should happen even earlier */ +module_init(btree_module_init); +module_exit(btree_module_exit); + +MODULE_AUTHOR("Joern Engel <joern@logfs.org>"); +MODULE_AUTHOR("Johannes Berg <johannes@sipsolutions.net>"); +MODULE_LICENSE("GPL"); diff --git a/lib/bug.c b/lib/bug.c index 300e41afbf97..19552096d16b 100644 --- a/lib/bug.c +++ b/lib/bug.c @@ -72,8 +72,8 @@ static const struct bug_entry *module_find_bug(unsigned long bugaddr) return NULL; } -int module_bug_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, - struct module *mod) +void 
module_bug_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, + struct module *mod) { char *secstrings; unsigned int i; @@ -97,8 +97,6 @@ int module_bug_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, * could potentially lead to deadlock and thus be counter-productive. */ list_add(&mod->bug_list, &module_bug_list); - - return 0; } void module_bug_cleanup(struct module *mod) @@ -136,8 +134,6 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs) bug = find_bug(bugaddr); - printk(KERN_EMERG "------------[ cut here ]------------\n"); - file = NULL; line = 0; warning = 0; @@ -156,19 +152,25 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs) if (warning) { /* this is a WARN_ON rather than BUG/BUG_ON */ + printk(KERN_WARNING "------------[ cut here ]------------\n"); + if (file) - printk(KERN_ERR "Badness at %s:%u\n", + printk(KERN_WARNING "WARNING: at %s:%u\n", file, line); else - printk(KERN_ERR "Badness at %p " + printk(KERN_WARNING "WARNING: at %p " "[verbose debug info unavailable]\n", (void *)bugaddr); + print_modules(); show_regs(regs); - add_taint(TAINT_WARN); + print_oops_end_marker(); + add_taint(BUG_GET_TAINT(bug)); return BUG_TRAP_TYPE_WARN; } + printk(KERN_EMERG "------------[ cut here ]------------\n"); + if (file) printk(KERN_CRIT "kernel BUG at %s:%u!\n", file, line); diff --git a/lib/cpu-notifier-error-inject.c b/lib/cpu-notifier-error-inject.c new file mode 100644 index 000000000000..4dc20321b0d5 --- /dev/null +++ b/lib/cpu-notifier-error-inject.c @@ -0,0 +1,63 @@ +#include <linux/kernel.h> +#include <linux/cpu.h> +#include <linux/module.h> +#include <linux/notifier.h> + +static int priority; +static int cpu_up_prepare_error; +static int cpu_down_prepare_error; + +module_param(priority, int, 0); +MODULE_PARM_DESC(priority, "specify cpu notifier priority"); + +module_param(cpu_up_prepare_error, int, 0644); +MODULE_PARM_DESC(cpu_up_prepare_error, + "specify error code to inject CPU_UP_PREPARE action"); + +module_param(cpu_down_prepare_error, int, 0644); +MODULE_PARM_DESC(cpu_down_prepare_error, + "specify error code to inject CPU_DOWN_PREPARE action"); + +static int err_inject_cpu_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + int err = 0; + + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + err = cpu_up_prepare_error; + break; + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + err = cpu_down_prepare_error; + break; + } + if (err) + printk(KERN_INFO "Injecting error (%d) at cpu notifier\n", err); + + return notifier_from_errno(err); +} + +static struct notifier_block err_inject_cpu_notifier = { + .notifier_call = err_inject_cpu_callback, +}; + +static int err_inject_init(void) +{ + err_inject_cpu_notifier.priority = priority; + + return register_hotcpu_notifier(&err_inject_cpu_notifier); +} + +static void err_inject_exit(void) +{ + unregister_hotcpu_notifier(&err_inject_cpu_notifier); +} + +module_init(err_inject_init); +module_exit(err_inject_exit); + +MODULE_DESCRIPTION("CPU notifier error injection module"); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Akinobu Mita <akinobu.mita@gmail.com>"); diff --git a/lib/cpumask.c b/lib/cpumask.c index 7bb4142a502f..05d6aca7fc19 100644 --- a/lib/cpumask.c +++ b/lib/cpumask.c @@ -1,3 +1,4 @@ +#include <linux/slab.h> #include <linux/kernel.h> #include <linux/bitops.h> #include <linux/cpumask.h> diff --git a/lib/crc32.c b/lib/crc32.c index 02e3b31b3a79..4855995fcde9 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -25,16 +25,19 
@@ #include <linux/module.h> #include <linux/compiler.h> #include <linux/types.h> -#include <linux/slab.h> #include <linux/init.h> #include <asm/atomic.h> #include "crc32defs.h" #if CRC_LE_BITS == 8 -#define tole(x) __constant_cpu_to_le32(x) -#define tobe(x) __constant_cpu_to_be32(x) +# define tole(x) __constant_cpu_to_le32(x) #else -#define tole(x) (x) -#define tobe(x) (x) +# define tole(x) (x) +#endif + +#if CRC_BE_BITS == 8 +# define tobe(x) __constant_cpu_to_be32(x) +#else +# define tobe(x) (x) #endif #include "crc32table.h" @@ -45,33 +48,37 @@ MODULE_LICENSE("GPL"); #if CRC_LE_BITS == 8 || CRC_BE_BITS == 8 static inline u32 -crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 *tab) +crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256]) { # ifdef __LITTLE_ENDIAN -# define DO_CRC(x) crc = tab[(crc ^ (x)) & 255 ] ^ (crc >> 8) +# define DO_CRC(x) crc = tab[0][(crc ^ (x)) & 255] ^ (crc >> 8) +# define DO_CRC4 crc = tab[3][(crc) & 255] ^ \ + tab[2][(crc >> 8) & 255] ^ \ + tab[1][(crc >> 16) & 255] ^ \ + tab[0][(crc >> 24) & 255] # else -# define DO_CRC(x) crc = tab[((crc >> 24) ^ (x)) & 255] ^ (crc << 8) +# define DO_CRC(x) crc = tab[0][((crc >> 24) ^ (x)) & 255] ^ (crc << 8) +# define DO_CRC4 crc = tab[0][(crc) & 255] ^ \ + tab[1][(crc >> 8) & 255] ^ \ + tab[2][(crc >> 16) & 255] ^ \ + tab[3][(crc >> 24) & 255] # endif - const u32 *b = (const u32 *)buf; + const u32 *b; size_t rem_len; /* Align it */ - if (unlikely((long)b & 3 && len)) { - u8 *p = (u8 *)b; + if (unlikely((long)buf & 3 && len)) { do { - DO_CRC(*p++); - } while ((--len) && ((long)p)&3); - b = (u32 *)p; + DO_CRC(*buf++); + } while ((--len) && ((long)buf)&3); } rem_len = len & 3; /* load data 32 bits wide, xor data 32 bits wide. */ len = len >> 2; + b = (const u32 *)buf; for (--b; len; --len) { crc ^= *++b; /* use pre increment for speed */ - DO_CRC(0); - DO_CRC(0); - DO_CRC(0); - DO_CRC(0); + DO_CRC4; } len = rem_len; /* And the last few bytes */ @@ -82,6 +89,8 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 *tab) } while (--len); } return crc; +#undef DO_CRC +#undef DO_CRC4 } #endif /** @@ -114,14 +123,11 @@ u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len) u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len) { # if CRC_LE_BITS == 8 - const u32 *tab = crc32table_le; + const u32 (*tab)[] = crc32table_le; crc = __cpu_to_le32(crc); crc = crc32_body(crc, p, len, tab); return __le32_to_cpu(crc); -#undef ENDIAN_SHIFT -#undef DO_CRC - # elif CRC_LE_BITS == 4 while (len--) { crc ^= *p++; @@ -174,14 +180,11 @@ u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len) u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len) { # if CRC_BE_BITS == 8 - const u32 *tab = crc32table_be; + const u32 (*tab)[] = crc32table_be; crc = __cpu_to_be32(crc); crc = crc32_body(crc, p, len, tab); return __be32_to_cpu(crc); -#undef ENDIAN_SHIFT -#undef DO_CRC - # elif CRC_BE_BITS == 4 while (len--) { crc ^= *p++ << 24; diff --git a/lib/debug_locks.c b/lib/debug_locks.c index bc3b11731b9c..b1c177307677 100644 --- a/lib/debug_locks.c +++ b/lib/debug_locks.c @@ -8,7 +8,6 @@ * * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> */ -#include <linux/kernel.h> #include <linux/rwsem.h> #include <linux/mutex.h> #include <linux/module.h> @@ -23,6 +22,7 @@ * shut up after that. 
*/ int debug_locks = 1; +EXPORT_SYMBOL_GPL(debug_locks); /* * The locking-testsuite uses <debug_locks_silent> to get a @@ -38,7 +38,6 @@ int debug_locks_off(void) { if (__debug_locks_off()) { if (!debug_locks_silent) { - oops_in_progress = 1; console_verbose(); return 1; } diff --git a/lib/debugobjects.c b/lib/debugobjects.c index a9a8996d286a..deebcc57d4e6 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -12,6 +12,7 @@ #include <linux/sched.h> #include <linux/seq_file.h> #include <linux/debugfs.h> +#include <linux/slab.h> #include <linux/hash.h> #define ODEBUG_HASH_BITS 14 @@ -140,6 +141,7 @@ alloc_object(void *addr, struct debug_bucket *b, struct debug_obj_descr *descr) obj->object = addr; obj->descr = descr; obj->state = ODEBUG_STATE_NONE; + obj->astate = 0; hlist_del(&obj->node); hlist_add_head(&obj->node, &b->list); @@ -251,8 +253,10 @@ static void debug_print_object(struct debug_obj *obj, char *msg) if (limit < 5 && obj->descr != descr_test) { limit++; - WARN(1, KERN_ERR "ODEBUG: %s %s object type: %s\n", msg, - obj_states[obj->state], obj->descr->name); + WARN(1, KERN_ERR "ODEBUG: %s %s (active state %u) " + "object type: %s\n", + msg, obj_states[obj->state], obj->astate, + obj->descr->name); } debug_objects_warnings++; } @@ -446,7 +450,10 @@ void debug_object_deactivate(void *addr, struct debug_obj_descr *descr) case ODEBUG_STATE_INIT: case ODEBUG_STATE_INACTIVE: case ODEBUG_STATE_ACTIVE: - obj->state = ODEBUG_STATE_INACTIVE; + if (!obj->astate) + obj->state = ODEBUG_STATE_INACTIVE; + else + debug_print_object(obj, "deactivate"); break; case ODEBUG_STATE_DESTROYED: @@ -552,6 +559,53 @@ out_unlock: raw_spin_unlock_irqrestore(&db->lock, flags); } +/** + * debug_object_active_state - debug checks object usage state machine + * @addr: address of the object + * @descr: pointer to an object specific debug description structure + * @expect: expected state + * @next: state to move to if expected state is found + */ +void +debug_object_active_state(void *addr, struct debug_obj_descr *descr, + unsigned int expect, unsigned int next) +{ + struct debug_bucket *db; + struct debug_obj *obj; + unsigned long flags; + + if (!debug_objects_enabled) + return; + + db = get_bucket((unsigned long) addr); + + raw_spin_lock_irqsave(&db->lock, flags); + + obj = lookup_object(addr, db); + if (obj) { + switch (obj->state) { + case ODEBUG_STATE_ACTIVE: + if (obj->astate == expect) + obj->astate = next; + else + debug_print_object(obj, "active_state"); + break; + + default: + debug_print_object(obj, "active_state"); + break; + } + } else { + struct debug_obj o = { .object = addr, + .state = ODEBUG_STATE_NOTAVAILABLE, + .descr = descr }; + + debug_print_object(&o, "active_state"); + } + + raw_spin_unlock_irqrestore(&db->lock, flags); +} + #ifdef CONFIG_DEBUG_OBJECTS_FREE static void __debug_check_no_obj_freed(const void *address, unsigned long size) { @@ -773,7 +827,7 @@ static int __init fixup_free(void *addr, enum debug_obj_state state) } } -static int +static int __init check_results(void *addr, enum debug_obj_state state, int fixups, int warnings) { struct debug_bucket *db; @@ -916,7 +970,7 @@ void __init debug_objects_early_init(void) /* * Convert the statically allocated objects to dynamic ones: */ -static int debug_objects_replace_static_objects(void) +static int __init debug_objects_replace_static_objects(void) { struct debug_bucket *db = obj_hash; struct hlist_node *node, *tmp; diff --git a/lib/decompress.c b/lib/decompress.c index a7606815541f..3d766b7f60ab 100644 --- a/lib/decompress.c +++ 
b/lib/decompress.c @@ -8,6 +8,7 @@ #include <linux/decompress/bunzip2.h> #include <linux/decompress/unlzma.h> +#include <linux/decompress/unxz.h> #include <linux/decompress/inflate.h> #include <linux/decompress/unlzo.h> @@ -23,6 +24,9 @@ #ifndef CONFIG_DECOMPRESS_LZMA # define unlzma NULL #endif +#ifndef CONFIG_DECOMPRESS_XZ +# define unxz NULL +#endif #ifndef CONFIG_DECOMPRESS_LZO # define unlzo NULL #endif @@ -36,6 +40,7 @@ static const struct compress_format { { {037, 0236}, "gzip", gunzip }, { {0x42, 0x5a}, "bzip2", bunzip2 }, { {0x5d, 0x00}, "lzma", unlzma }, + { {0xfd, 0x37}, "xz", unxz }, { {0x89, 0x4c}, "lzo", unlzo }, { {0, 0}, NULL, NULL } }; diff --git a/lib/decompress_bunzip2.c b/lib/decompress_bunzip2.c index a4e971dee102..a7b80c1d6a0d 100644 --- a/lib/decompress_bunzip2.c +++ b/lib/decompress_bunzip2.c @@ -49,7 +49,6 @@ #define PREBOOT #else #include <linux/decompress/bunzip2.h> -#include <linux/slab.h> #endif /* STATIC */ #include <linux/decompress/mm.h> @@ -107,6 +106,8 @@ struct bunzip_data { unsigned char selectors[32768]; /* nSelectors = 15 bits */ struct group_data groups[MAX_GROUPS]; /* Huffman coding tables */ int io_error; /* non-zero if we have IO error */ + int byteCount[256]; + unsigned char symToByte[256], mtfSymbol[256]; }; @@ -158,14 +159,16 @@ static int INIT get_next_block(struct bunzip_data *bd) int *base = NULL; int *limit = NULL; int dbufCount, nextSym, dbufSize, groupCount, selector, - i, j, k, t, runPos, symCount, symTotal, nSelectors, - byteCount[256]; - unsigned char uc, symToByte[256], mtfSymbol[256], *selectors; + i, j, k, t, runPos, symCount, symTotal, nSelectors, *byteCount; + unsigned char uc, *symToByte, *mtfSymbol, *selectors; unsigned int *dbuf, origPtr; dbuf = bd->dbuf; dbufSize = bd->dbufSize; selectors = bd->selectors; + byteCount = bd->byteCount; + symToByte = bd->symToByte; + mtfSymbol = bd->mtfSymbol; /* Read in header signature and CRC, then validate signature. 
(last block signature means CRC is for whole file, return now) */ @@ -678,13 +681,12 @@ STATIC int INIT bunzip2(unsigned char *buf, int len, int(*flush)(void*, unsigned int), unsigned char *outbuf, int *pos, - void(*error_fn)(char *x)) + void(*error)(char *x)) { struct bunzip_data *bd; int i = -1; unsigned char *inbuf; - set_error_fn(error_fn); if (flush) outbuf = malloc(BZIP2_IOBUF_SIZE); @@ -747,8 +749,8 @@ STATIC int INIT decompress(unsigned char *buf, int len, int(*flush)(void*, unsigned int), unsigned char *outbuf, int *pos, - void(*error_fn)(char *x)) + void(*error)(char *x)) { - return bunzip2(buf, len - 4, fill, flush, outbuf, pos, error_fn); + return bunzip2(buf, len - 4, fill, flush, outbuf, pos, error); } #endif diff --git a/lib/decompress_inflate.c b/lib/decompress_inflate.c index fc686c7a0a0d..19ff89e34eec 100644 --- a/lib/decompress_inflate.c +++ b/lib/decompress_inflate.c @@ -19,7 +19,6 @@ #include "zlib_inflate/inflate.h" #include "zlib_inflate/infutil.h" -#include <linux/slab.h> #endif /* STATIC */ @@ -27,7 +26,7 @@ #define GZIP_IOBUF_SIZE (16*1024) -static int nofill(void *buffer, unsigned int len) +static int INIT nofill(void *buffer, unsigned int len) { return -1; } @@ -38,13 +37,12 @@ STATIC int INIT gunzip(unsigned char *buf, int len, int(*flush)(void*, unsigned int), unsigned char *out_buf, int *pos, - void(*error_fn)(char *x)) { + void(*error)(char *x)) { u8 *zbuf; struct z_stream_s *strm; int rc; size_t out_len; - set_error_fn(error_fn); rc = -1; if (flush) { out_len = 0x8000; /* 32 K */ @@ -100,13 +98,22 @@ STATIC int INIT gunzip(unsigned char *buf, int len, * possible asciz filename) */ strm->next_in = zbuf + 10; + strm->avail_in = len - 10; /* skip over asciz filename */ if (zbuf[3] & 0x8) { - while (strm->next_in[0]) - strm->next_in++; - strm->next_in++; + do { + /* + * If the filename doesn't fit into the buffer, + * the file is very probably corrupt. Don't try + * to read more data. + */ + if (strm->avail_in == 0) { + error("header error"); + goto gunzip_5; + } + --strm->avail_in; + } while (*strm->next_in++); } - strm->avail_in = len - (strm->next_in - zbuf); strm->next_out = out_buf; strm->avail_out = out_len; diff --git a/lib/decompress_unlzma.c b/lib/decompress_unlzma.c index ca82fde81c8f..476c65af9709 100644 --- a/lib/decompress_unlzma.c +++ b/lib/decompress_unlzma.c @@ -33,7 +33,6 @@ #define PREBOOT #else #include <linux/decompress/unlzma.h> -#include <linux/slab.h> #endif /* STATIC */ #include <linux/decompress/mm.h> @@ -74,6 +73,7 @@ struct rc { uint32_t code; uint32_t range; uint32_t bound; + void (*error)(char *); }; @@ -82,7 +82,7 @@ struct rc { #define RC_MODEL_TOTAL_BITS 11 -static int nofill(void *buffer, unsigned int len) +static int INIT nofill(void *buffer, unsigned int len) { return -1; } @@ -92,7 +92,7 @@ static void INIT rc_read(struct rc *rc) { rc->buffer_size = rc->fill((char *)rc->buffer, LZMA_IOBUF_SIZE); if (rc->buffer_size <= 0) - error("unexpected EOF"); + rc->error("unexpected EOF"); rc->ptr = rc->buffer; rc->buffer_end = rc->buffer + rc->buffer_size; } @@ -127,12 +127,6 @@ static inline void INIT rc_init_code(struct rc *rc) } -/* Called once. 
TODO: bb_maybe_free() */ -static inline void INIT rc_free(struct rc *rc) -{ - free(rc->buffer); -} - /* Called twice, but one callsite is in inline'd rc_is_bit_0_helper() */ static void INIT rc_do_normalize(struct rc *rc) { @@ -169,7 +163,7 @@ static inline void INIT rc_update_bit_0(struct rc *rc, uint16_t *p) rc->range = rc->bound; *p += ((1 << RC_MODEL_TOTAL_BITS) - *p) >> RC_MOVE_BITS; } -static inline void rc_update_bit_1(struct rc *rc, uint16_t *p) +static inline void INIT rc_update_bit_1(struct rc *rc, uint16_t *p) { rc->range -= rc->bound; rc->code -= rc->bound; @@ -319,32 +313,38 @@ static inline uint8_t INIT peek_old_byte(struct writer *wr, } -static inline void INIT write_byte(struct writer *wr, uint8_t byte) +static inline int INIT write_byte(struct writer *wr, uint8_t byte) { wr->buffer[wr->buffer_pos++] = wr->previous_byte = byte; if (wr->flush && wr->buffer_pos == wr->header->dict_size) { wr->buffer_pos = 0; wr->global_pos += wr->header->dict_size; - wr->flush((char *)wr->buffer, wr->header->dict_size); + if (wr->flush((char *)wr->buffer, wr->header->dict_size) + != wr->header->dict_size) + return -1; } + return 0; } -static inline void INIT copy_byte(struct writer *wr, uint32_t offs) +static inline int INIT copy_byte(struct writer *wr, uint32_t offs) { - write_byte(wr, peek_old_byte(wr, offs)); + return write_byte(wr, peek_old_byte(wr, offs)); } -static inline void INIT copy_bytes(struct writer *wr, +static inline int INIT copy_bytes(struct writer *wr, uint32_t rep0, int len) { do { - copy_byte(wr, rep0); + if (copy_byte(wr, rep0)) + return -1; len--; } while (len != 0 && wr->buffer_pos < wr->header->dst_size); + + return len; } -static inline void INIT process_bit0(struct writer *wr, struct rc *rc, +static inline int INIT process_bit0(struct writer *wr, struct rc *rc, struct cstate *cst, uint16_t *p, int pos_state, uint16_t *prob, int lc, uint32_t literal_pos_mask) { @@ -378,16 +378,17 @@ static inline void INIT process_bit0(struct writer *wr, struct rc *rc, uint16_t *prob_lit = prob + mi; rc_get_bit(rc, prob_lit, &mi); } - write_byte(wr, mi); if (cst->state < 4) cst->state = 0; else if (cst->state < 10) cst->state -= 3; else cst->state -= 6; + + return write_byte(wr, mi); } -static inline void INIT process_bit1(struct writer *wr, struct rc *rc, +static inline int INIT process_bit1(struct writer *wr, struct rc *rc, struct cstate *cst, uint16_t *p, int pos_state, uint16_t *prob) { int offset; @@ -418,8 +419,7 @@ static inline void INIT process_bit1(struct writer *wr, struct rc *rc, cst->state = cst->state < LZMA_NUM_LIT_STATES ? 
9 : 11; - copy_byte(wr, cst->rep0); - return; + return copy_byte(wr, cst->rep0); } else { rc_update_bit_1(rc, prob); } @@ -521,12 +521,15 @@ static inline void INIT process_bit1(struct writer *wr, struct rc *rc, } else cst->rep0 = pos_slot; if (++(cst->rep0) == 0) - return; + return 0; + if (cst->rep0 > wr->header->dict_size + || cst->rep0 > get_pos(wr)) + return -1; } len += LZMA_MATCH_MIN_LEN; - copy_bytes(wr, cst->rep0, len); + return copy_bytes(wr, cst->rep0, len); } @@ -536,7 +539,7 @@ STATIC inline int INIT unlzma(unsigned char *buf, int in_len, int(*flush)(void*, unsigned int), unsigned char *output, int *posp, - void(*error_fn)(char *x) + void(*error)(char *x) ) { struct lzma_header header; @@ -552,7 +555,7 @@ STATIC inline int INIT unlzma(unsigned char *buf, int in_len, unsigned char *inbuf; int ret = -1; - set_error_fn(error_fn); + rc.error = error; if (buf) inbuf = buf; @@ -580,8 +583,10 @@ STATIC inline int INIT unlzma(unsigned char *buf, int in_len, ((unsigned char *)&header)[i] = *rc.ptr++; } - if (header.pos >= (9 * 5 * 5)) + if (header.pos >= (9 * 5 * 5)) { error("bad header"); + goto exit_1; + } mi = 0; lc = header.pos; @@ -627,21 +632,29 @@ STATIC inline int INIT unlzma(unsigned char *buf, int in_len, int pos_state = get_pos(&wr) & pos_state_mask; uint16_t *prob = p + LZMA_IS_MATCH + (cst.state << LZMA_NUM_POS_BITS_MAX) + pos_state; - if (rc_is_bit_0(&rc, prob)) - process_bit0(&wr, &rc, &cst, p, pos_state, prob, - lc, literal_pos_mask); - else { - process_bit1(&wr, &rc, &cst, p, pos_state, prob); + if (rc_is_bit_0(&rc, prob)) { + if (process_bit0(&wr, &rc, &cst, p, pos_state, prob, + lc, literal_pos_mask)) { + error("LZMA data is corrupt"); + goto exit_3; + } + } else { + if (process_bit1(&wr, &rc, &cst, p, pos_state, prob)) { + error("LZMA data is corrupt"); + goto exit_3; + } if (cst.rep0 == 0) break; } + if (rc.buffer_size <= 0) + goto exit_3; } if (posp) *posp = rc.ptr-rc.buffer; - if (wr.flush) - wr.flush(wr.buffer, wr.buffer_pos); - ret = 0; + if (!wr.flush || wr.flush(wr.buffer, wr.buffer_pos) == wr.buffer_pos) + ret = 0; +exit_3: large_free(p); exit_2: if (!output) @@ -659,9 +672,9 @@ STATIC int INIT decompress(unsigned char *buf, int in_len, int(*flush)(void*, unsigned int), unsigned char *output, int *posp, - void(*error_fn)(char *x) + void(*error)(char *x) ) { - return unlzma(buf, in_len - 4, fill, flush, output, posp, error_fn); + return unlzma(buf, in_len - 4, fill, flush, output, posp, error); } #endif diff --git a/lib/decompress_unlzo.c b/lib/decompress_unlzo.c index db521f45626e..5a7a2adf4c4c 100644 --- a/lib/decompress_unlzo.c +++ b/lib/decompress_unlzo.c @@ -33,7 +33,6 @@ #ifdef STATIC #include "lzo/lzo1x_decompress.c" #else -#include <linux/slab.h> #include <linux/decompress/unlzo.h> #endif @@ -49,14 +48,25 @@ static const unsigned char lzop_magic[] = { #define LZO_BLOCK_SIZE (256*1024l) #define HEADER_HAS_FILTER 0x00000800L +#define HEADER_SIZE_MIN (9 + 7 + 4 + 8 + 1 + 4) +#define HEADER_SIZE_MAX (9 + 7 + 1 + 8 + 8 + 4 + 1 + 255 + 4) -STATIC inline int INIT parse_header(u8 *input, u8 *skip) +STATIC inline int INIT parse_header(u8 *input, int *skip, int in_len) { int l; u8 *parse = input; + u8 *end = input + in_len; u8 level = 0; u16 version; + /* + * Check that there's enough input to possibly have a valid header. + * Then it is possible to parse several fields until the minimum + * size may have been used. 
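For reference, the two header-size bounds defined just above are plain arithmetic on the lzop header fields they enumerate (9 magic bytes, the fixed version/method fields, the longest possible file name, and the checksum):

	HEADER_SIZE_MIN = 9 + 7 + 4 + 8 + 1 + 4                 = 33 bytes
	HEADER_SIZE_MAX = 9 + 7 + 1 + 8 + 8 + 4 + 1 + 255 + 4   = 297 bytes

so parse_header() can refuse to start on fewer than 33 bytes of input, and a fill()-driven caller never needs more than 297 bytes buffered before the header is fully consumed.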
+ */ + if (in_len < HEADER_SIZE_MIN) + return 0; + /* read magic: 9 first bits */ for (l = 0; l < 9; l++) { if (*parse++ != lzop_magic[l]) @@ -74,6 +84,15 @@ STATIC inline int INIT parse_header(u8 *input, u8 *skip) else parse += 4; /* flags */ + /* + * At least mode, mtime_low, filename length, and checksum must + * be left to be parsed. If also mtime_high is present, it's OK + * because the next input buffer check is after reading the + * filename length. + */ + if (end - parse < 8 + 1 + 4) + return 0; + /* skip mode and mtime_low */ parse += 8; if (version >= 0x0940) @@ -81,6 +100,8 @@ STATIC inline int INIT parse_header(u8 *input, u8 *skip) l = *parse++; /* don't care about the file name, and skip checksum */ + if (end - parse < l + 4) + return 0; parse += l + 4; *skip = parse - input; @@ -91,15 +112,14 @@ STATIC inline int INIT unlzo(u8 *input, int in_len, int (*fill) (void *, unsigned int), int (*flush) (void *, unsigned int), u8 *output, int *posp, - void (*error_fn) (char *x)) + void (*error) (char *x)) { - u8 skip = 0, r = 0; + u8 r = 0; + int skip = 0; u32 src_len, dst_len; size_t tmp; u8 *in_buf, *in_buf_save, *out_buf; - int obytes_processed = 0; - - set_error_fn(error_fn); + int ret = -1; if (output) { out_buf = output; @@ -119,8 +139,8 @@ STATIC inline int INIT unlzo(u8 *input, int in_len, goto exit_1; } else if (input) { in_buf = input; - } else if (!fill || !posp) { - error("NULL input pointer and missing position pointer or fill function"); + } else if (!fill) { + error("NULL input pointer and missing fill function"); goto exit_1; } else { in_buf = malloc(lzo1x_worst_compress(LZO_BLOCK_SIZE)); @@ -134,22 +154,47 @@ STATIC inline int INIT unlzo(u8 *input, int in_len, if (posp) *posp = 0; - if (fill) - fill(in_buf, lzo1x_worst_compress(LZO_BLOCK_SIZE)); + if (fill) { + /* + * Start from in_buf + HEADER_SIZE_MAX to make it possible + * to use memcpy() to copy the unused data to the beginning + * of the buffer. This way memmove() isn't needed which + * is missing from pre-boot environments of most archs. + */ + in_buf += HEADER_SIZE_MAX; + in_len = fill(in_buf, HEADER_SIZE_MAX); + } - if (!parse_header(input, &skip)) { + if (!parse_header(in_buf, &skip, in_len)) { error("invalid header"); goto exit_2; } in_buf += skip; + in_len -= skip; + + if (fill) { + /* Move the unused data to the beginning of the buffer. 
*/ + memcpy(in_buf_save, in_buf, in_len); + in_buf = in_buf_save; + } if (posp) *posp = skip; for (;;) { /* read uncompressed block size */ + if (fill && in_len < 4) { + skip = fill(in_buf + in_len, 4 - in_len); + if (skip > 0) + in_len += skip; + } + if (in_len < 4) { + error("file corrupted"); + goto exit_2; + } dst_len = get_unaligned_be32(in_buf); in_buf += 4; + in_len -= 4; /* exit if last block */ if (dst_len == 0) { @@ -164,8 +209,18 @@ STATIC inline int INIT unlzo(u8 *input, int in_len, } /* read compressed block size, and skip block checksum info */ + if (fill && in_len < 8) { + skip = fill(in_buf + in_len, 8 - in_len); + if (skip > 0) + in_len += skip; + } + if (in_len < 8) { + error("file corrupted"); + goto exit_2; + } src_len = get_unaligned_be32(in_buf); in_buf += 8; + in_len -= 8; if (src_len <= 0 || src_len > dst_len) { error("file corrupted"); @@ -173,29 +228,55 @@ STATIC inline int INIT unlzo(u8 *input, int in_len, } /* decompress */ + if (fill && in_len < src_len) { + skip = fill(in_buf + in_len, src_len - in_len); + if (skip > 0) + in_len += skip; + } + if (in_len < src_len) { + error("file corrupted"); + goto exit_2; + } tmp = dst_len; - r = lzo1x_decompress_safe((u8 *) in_buf, src_len, + + /* When the input data is not compressed at all, + * lzo1x_decompress_safe will fail, so call memcpy() + * instead */ + if (unlikely(dst_len == src_len)) + memcpy(out_buf, in_buf, src_len); + else { + r = lzo1x_decompress_safe((u8 *) in_buf, src_len, out_buf, &tmp); - if (r != LZO_E_OK || dst_len != tmp) { - error("Compressed data violation"); - goto exit_2; + if (r != LZO_E_OK || dst_len != tmp) { + error("Compressed data violation"); + goto exit_2; + } } - obytes_processed += dst_len; - if (flush) - flush(out_buf, dst_len); + if (flush && flush(out_buf, dst_len) != dst_len) + goto exit_2; if (output) out_buf += dst_len; if (posp) *posp += src_len + 12; + + in_buf += src_len; + in_len -= src_len; if (fill) { + /* + * If there happens to still be unused data left in + * in_buf, move it to the beginning of the buffer. + * Use a loop to avoid memmove() dependency. + */ + if (in_len > 0) + for (skip = 0; skip < in_len; ++skip) + in_buf_save[skip] = in_buf[skip]; in_buf = in_buf_save; - fill(in_buf, lzo1x_worst_compress(LZO_BLOCK_SIZE)); - } else - in_buf += src_len; + } } + ret = 0; exit_2: if (!input) free(in_buf); @@ -203,7 +284,7 @@ exit_1: if (!output) free(out_buf); exit: - return obytes_processed; + return ret; } #define decompress unlzo diff --git a/lib/decompress_unxz.c b/lib/decompress_unxz.c new file mode 100644 index 000000000000..cecd23df2b9a --- /dev/null +++ b/lib/decompress_unxz.c @@ -0,0 +1,397 @@ +/* + * Wrapper for decompressing XZ-compressed kernel, initramfs, and initrd + * + * Author: Lasse Collin <lasse.collin@tukaani.org> + * + * This file has been put into the public domain. + * You can do whatever you want with this file. + */ + +/* + * Important notes about in-place decompression + * + * At least on x86, the kernel is decompressed in place: the compressed data + * is placed to the end of the output buffer, and the decompressor overwrites + * most of the compressed data. There must be enough safety margin to + * guarantee that the write position is always behind the read position. + * + * The safety margin for XZ with LZMA2 or BCJ+LZMA2 is calculated below. + * Note that the margin with XZ is bigger than with Deflate (gzip)! 
+ * + * The worst case for in-place decompression is that the beginning of + * the file is compressed extremely well, and the rest of the file is + * uncompressible. Thus, we must look for worst-case expansion when the + * compressor is encoding uncompressible data. + * + * The structure of the .xz file in case of a compresed kernel is as follows. + * Sizes (as bytes) of the fields are in parenthesis. + * + * Stream Header (12) + * Block Header: + * Block Header (8-12) + * Compressed Data (N) + * Block Padding (0-3) + * CRC32 (4) + * Index (8-20) + * Stream Footer (12) + * + * Normally there is exactly one Block, but let's assume that there are + * 2-4 Blocks just in case. Because Stream Header and also Block Header + * of the first Block don't make the decompressor produce any uncompressed + * data, we can ignore them from our calculations. Block Headers of possible + * additional Blocks have to be taken into account still. With these + * assumptions, it is safe to assume that the total header overhead is + * less than 128 bytes. + * + * Compressed Data contains LZMA2 or BCJ+LZMA2 encoded data. Since BCJ + * doesn't change the size of the data, it is enough to calculate the + * safety margin for LZMA2. + * + * LZMA2 stores the data in chunks. Each chunk has a header whose size is + * a maximum of 6 bytes, but to get round 2^n numbers, let's assume that + * the maximum chunk header size is 8 bytes. After the chunk header, there + * may be up to 64 KiB of actual payload in the chunk. Often the payload is + * quite a bit smaller though; to be safe, let's assume that an average + * chunk has only 32 KiB of payload. + * + * The maximum uncompressed size of the payload is 2 MiB. The minimum + * uncompressed size of the payload is in practice never less than the + * payload size itself. The LZMA2 format would allow uncompressed size + * to be less than the payload size, but no sane compressor creates such + * files. LZMA2 supports storing uncompressible data in uncompressed form, + * so there's never a need to create payloads whose uncompressed size is + * smaller than the compressed size. + * + * The assumption, that the uncompressed size of the payload is never + * smaller than the payload itself, is valid only when talking about + * the payload as a whole. It is possible that the payload has parts where + * the decompressor consumes more input than it produces output. Calculating + * the worst case for this would be tricky. Instead of trying to do that, + * let's simply make sure that the decompressor never overwrites any bytes + * of the payload which it is currently reading. + * + * Now we have enough information to calculate the safety margin. We need + * - 128 bytes for the .xz file format headers; + * - 8 bytes per every 32 KiB of uncompressed size (one LZMA2 chunk header + * per chunk, each chunk having average payload size of 32 KiB); and + * - 64 KiB (biggest possible LZMA2 chunk payload size) to make sure that + * the decompressor never overwrites anything from the LZMA2 chunk + * payload it is currently reading. 
+ * + * We get the following formula: + * + * safety_margin = 128 + uncompressed_size * 8 / 32768 + 65536 + * = 128 + (uncompressed_size >> 12) + 65536 + * + * For comparision, according to arch/x86/boot/compressed/misc.c, the + * equivalent formula for Deflate is this: + * + * safety_margin = 18 + (uncompressed_size >> 12) + 32768 + * + * Thus, when updating Deflate-only in-place kernel decompressor to + * support XZ, the fixed overhead has to be increased from 18+32768 bytes + * to 128+65536 bytes. + */ + +/* + * STATIC is defined to "static" if we are being built for kernel + * decompression (pre-boot code). <linux/decompress/mm.h> will define + * STATIC to empty if it wasn't already defined. Since we will need to + * know later if we are being used for kernel decompression, we define + * XZ_PREBOOT here. + */ +#ifdef STATIC +# define XZ_PREBOOT +#endif +#ifdef __KERNEL__ +# include <linux/decompress/mm.h> +#endif +#define XZ_EXTERN STATIC + +#ifndef XZ_PREBOOT +# include <linux/slab.h> +# include <linux/xz.h> +#else +/* + * Use the internal CRC32 code instead of kernel's CRC32 module, which + * is not available in early phase of booting. + */ +#define XZ_INTERNAL_CRC32 1 + +/* + * For boot time use, we enable only the BCJ filter of the current + * architecture or none if no BCJ filter is available for the architecture. + */ +#ifdef CONFIG_X86 +# define XZ_DEC_X86 +#endif +#ifdef CONFIG_PPC +# define XZ_DEC_POWERPC +#endif +#ifdef CONFIG_ARM +# define XZ_DEC_ARM +#endif +#ifdef CONFIG_IA64 +# define XZ_DEC_IA64 +#endif +#ifdef CONFIG_SPARC +# define XZ_DEC_SPARC +#endif + +/* + * This will get the basic headers so that memeq() and others + * can be defined. + */ +#include "xz/xz_private.h" + +/* + * Replace the normal allocation functions with the versions from + * <linux/decompress/mm.h>. vfree() needs to support vfree(NULL) + * when XZ_DYNALLOC is used, but the pre-boot free() doesn't support it. + * Workaround it here because the other decompressors don't need it. + */ +#undef kmalloc +#undef kfree +#undef vmalloc +#undef vfree +#define kmalloc(size, flags) malloc(size) +#define kfree(ptr) free(ptr) +#define vmalloc(size) malloc(size) +#define vfree(ptr) do { if (ptr != NULL) free(ptr); } while (0) + +/* + * FIXME: Not all basic memory functions are provided in architecture-specific + * files (yet). We define our own versions here for now, but this should be + * only a temporary solution. + * + * memeq and memzero are not used much and any remotely sane implementation + * is fast enough. memcpy/memmove speed matters in multi-call mode, but + * the kernel image is decompressed in single-call mode, in which only + * memcpy speed can matter and only if there is a lot of uncompressible data + * (LZMA2 stores uncompressible chunks in uncompressed form). Thus, the + * functions below should just be kept small; it's probably not worth + * optimizing for speed. + */ + +#ifndef memeq +static bool memeq(const void *a, const void *b, size_t size) +{ + const uint8_t *x = a; + const uint8_t *y = b; + size_t i; + + for (i = 0; i < size; ++i) + if (x[i] != y[i]) + return false; + + return true; +} +#endif + +#ifndef memzero +static void memzero(void *buf, size_t size) +{ + uint8_t *b = buf; + uint8_t *e = b + size; + + while (b != e) + *b++ = '\0'; +} +#endif + +#ifndef memmove +/* Not static to avoid a conflict with the prototype in the Linux headers. 
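To make the safety-margin formula derived above concrete, a worked example (not part of the patch): for a hypothetical 8 MiB (8388608-byte) uncompressed image,

	safety_margin = 128 + (8388608 >> 12) + 65536
	              = 128 + 2048 + 65536
	              = 67712 bytes

whereas the quoted Deflate formula gives 18 + 2048 + 32768 = 34834 bytes, so an in-place boot decompressor must budget roughly 32 KiB of additional slack when it grows XZ support.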
*/ +void *memmove(void *dest, const void *src, size_t size) +{ + uint8_t *d = dest; + const uint8_t *s = src; + size_t i; + + if (d < s) { + for (i = 0; i < size; ++i) + d[i] = s[i]; + } else if (d > s) { + i = size; + while (i-- > 0) + d[i] = s[i]; + } + + return dest; +} +#endif + +/* + * Since we need memmove anyway, would use it as memcpy too. + * Commented out for now to avoid breaking things. + */ +/* +#ifndef memcpy +# define memcpy memmove +#endif +*/ + +#include "xz/xz_crc32.c" +#include "xz/xz_dec_stream.c" +#include "xz/xz_dec_lzma2.c" +#include "xz/xz_dec_bcj.c" + +#endif /* XZ_PREBOOT */ + +/* Size of the input and output buffers in multi-call mode */ +#define XZ_IOBUF_SIZE 4096 + +/* + * This function implements the API defined in <linux/decompress/generic.h>. + * + * This wrapper will automatically choose single-call or multi-call mode + * of the native XZ decoder API. The single-call mode can be used only when + * both input and output buffers are available as a single chunk, i.e. when + * fill() and flush() won't be used. + */ +STATIC int INIT unxz(unsigned char *in, int in_size, + int (*fill)(void *dest, unsigned int size), + int (*flush)(void *src, unsigned int size), + unsigned char *out, int *in_used, + void (*error)(char *x)) +{ + struct xz_buf b; + struct xz_dec *s; + enum xz_ret ret; + bool must_free_in = false; + +#if XZ_INTERNAL_CRC32 + xz_crc32_init(); +#endif + + if (in_used != NULL) + *in_used = 0; + + if (fill == NULL && flush == NULL) + s = xz_dec_init(XZ_SINGLE, 0); + else + s = xz_dec_init(XZ_DYNALLOC, (uint32_t)-1); + + if (s == NULL) + goto error_alloc_state; + + if (flush == NULL) { + b.out = out; + b.out_size = (size_t)-1; + } else { + b.out_size = XZ_IOBUF_SIZE; + b.out = malloc(XZ_IOBUF_SIZE); + if (b.out == NULL) + goto error_alloc_out; + } + + if (in == NULL) { + must_free_in = true; + in = malloc(XZ_IOBUF_SIZE); + if (in == NULL) + goto error_alloc_in; + } + + b.in = in; + b.in_pos = 0; + b.in_size = in_size; + b.out_pos = 0; + + if (fill == NULL && flush == NULL) { + ret = xz_dec_run(s, &b); + } else { + do { + if (b.in_pos == b.in_size && fill != NULL) { + if (in_used != NULL) + *in_used += b.in_pos; + + b.in_pos = 0; + + in_size = fill(in, XZ_IOBUF_SIZE); + if (in_size < 0) { + /* + * This isn't an optimal error code + * but it probably isn't worth making + * a new one either. + */ + ret = XZ_BUF_ERROR; + break; + } + + b.in_size = in_size; + } + + ret = xz_dec_run(s, &b); + + if (flush != NULL && (b.out_pos == b.out_size + || (ret != XZ_OK && b.out_pos > 0))) { + /* + * Setting ret here may hide an error + * returned by xz_dec_run(), but probably + * it's not too bad. + */ + if (flush(b.out, b.out_pos) != (int)b.out_pos) + ret = XZ_BUF_ERROR; + + b.out_pos = 0; + } + } while (ret == XZ_OK); + + if (must_free_in) + free(in); + + if (flush != NULL) + free(b.out); + } + + if (in_used != NULL) + *in_used += b.in_pos; + + xz_dec_end(s); + + switch (ret) { + case XZ_STREAM_END: + return 0; + + case XZ_MEM_ERROR: + /* This can occur only in multi-call mode. 
*/ + error("XZ decompressor ran out of memory"); + break; + + case XZ_FORMAT_ERROR: + error("Input is not in the XZ format (wrong magic bytes)"); + break; + + case XZ_OPTIONS_ERROR: + error("Input was encoded with settings that are not " + "supported by this XZ decoder"); + break; + + case XZ_DATA_ERROR: + case XZ_BUF_ERROR: + error("XZ-compressed data is corrupt"); + break; + + default: + error("Bug in the XZ decompressor"); + break; + } + + return -1; + +error_alloc_in: + if (flush != NULL) + free(b.out); + +error_alloc_out: + xz_dec_end(s); + +error_alloc_state: + error("XZ decompressor ran out of memory"); + return -1; +} + +/* + * This macro is used by architecture-specific files to decompress + * the kernel image. + */ +#define decompress unxz diff --git a/lib/devres.c b/lib/devres.c index 72c8909006da..6efddf53b90c 100644 --- a/lib/devres.c +++ b/lib/devres.c @@ -1,5 +1,6 @@ #include <linux/pci.h> #include <linux/io.h> +#include <linux/gfp.h> #include <linux/module.h> void devm_ioremap_release(struct device *dev, void *res) @@ -327,7 +328,7 @@ EXPORT_SYMBOL(pcim_iomap_regions_request_all); * @pdev: PCI device to map IO resources for * @mask: Mask of BARs to unmap and release * - * Unamp and release regions specified by @mask. + * Unmap and release regions specified by @mask. */ void pcim_iounmap_regions(struct pci_dev *pdev, u16 mask) { diff --git a/lib/div64.c b/lib/div64.c index a111eb8de9cf..5b4919191778 100644 --- a/lib/div64.c +++ b/lib/div64.c @@ -77,26 +77,58 @@ s64 div_s64_rem(s64 dividend, s32 divisor, s32 *remainder) EXPORT_SYMBOL(div_s64_rem); #endif -/* 64bit divisor, dividend and result. dynamic precision */ +/** + * div64_u64 - unsigned 64bit divide with 64bit divisor + * @dividend: 64bit dividend + * @divisor: 64bit divisor + * + * This implementation is a modified version of the algorithm proposed + * by the book 'Hacker's Delight'. The original source and full proof + * can be found here and is available for use without restriction. + * + * 'http://www.hackersdelight.org/HDcode/newCode/divDouble.c' + */ #ifndef div64_u64 u64 div64_u64(u64 dividend, u64 divisor) { - u32 high, d; + u32 high = divisor >> 32; + u64 quot; - high = divisor >> 32; - if (high) { - unsigned int shift = fls(high); + if (high == 0) { + quot = div_u64(dividend, divisor); + } else { + int n = 1 + fls(high); + quot = div_u64(dividend >> n, divisor >> n); - d = divisor >> shift; - dividend >>= shift; - } else - d = divisor; + if (quot != 0) + quot--; + if ((dividend - quot * divisor) >= divisor) + quot++; + } - return div_u64(dividend, d); + return quot; } EXPORT_SYMBOL(div64_u64); #endif +/** + * div64_s64 - signed 64bit divide with 64bit divisor + * @dividend: 64bit dividend + * @divisor: 64bit divisor + */ +#ifndef div64_s64 +s64 div64_s64(s64 dividend, s64 divisor) +{ + s64 quot, t; + + quot = div64_u64(abs64(dividend), abs64(divisor)); + t = (dividend ^ divisor) >> 63; + + return (quot ^ t) - t; +} +EXPORT_SYMBOL(div64_s64); +#endif + #endif /* BITS_PER_LONG == 32 */ /* diff --git a/lib/dma-debug.c b/lib/dma-debug.c index 7d2f0b33e5a8..4bfb0471f106 100644 --- a/lib/dma-debug.c +++ b/lib/dma-debug.c @@ -570,7 +570,7 @@ static ssize_t filter_write(struct file *file, const char __user *userbuf, * Now parse out the first token and use it as the name for the * driver to filter for. 
*/ - for (i = 0; i < NAME_MAX_LEN; ++i) { + for (i = 0; i < NAME_MAX_LEN - 1; ++i) { current_driver_name[i] = buf[i]; if (isspace(buf[i]) || buf[i] == ' ' || buf[i] == 0) break; @@ -587,9 +587,10 @@ out_unlock: return count; } -const struct file_operations filter_fops = { +static const struct file_operations filter_fops = { .read = filter_read, .write = filter_write, + .llseek = default_llseek, }; static int dma_debug_fs_init(void) diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c index f93502915988..b335acb43be2 100644 --- a/lib/dynamic_debug.c +++ b/lib/dynamic_debug.c @@ -25,19 +25,12 @@ #include <linux/uaccess.h> #include <linux/dynamic_debug.h> #include <linux/debugfs.h> +#include <linux/slab.h> +#include <linux/jump_label.h> extern struct _ddebug __start___verbose[]; extern struct _ddebug __stop___verbose[]; -/* dynamic_debug_enabled, and dynamic_debug_enabled2 are bitmasks in which - * bit n is set to 1 if any modname hashes into the bucket n, 0 otherwise. They - * use independent hash functions, to reduce the chance of false positives. - */ -long long dynamic_debug_enabled; -EXPORT_SYMBOL_GPL(dynamic_debug_enabled); -long long dynamic_debug_enabled2; -EXPORT_SYMBOL_GPL(dynamic_debug_enabled2); - struct ddebug_table { struct list_head link; char *mod_name; @@ -87,26 +80,6 @@ static char *ddebug_describe_flags(struct _ddebug *dp, char *buf, } /* - * must be called with ddebug_lock held - */ - -static int disabled_hash(char hash, bool first_table) -{ - struct ddebug_table *dt; - char table_hash_value; - - list_for_each_entry(dt, &ddebug_tables, link) { - if (first_table) - table_hash_value = dt->ddebugs->primary_hash; - else - table_hash_value = dt->ddebugs->secondary_hash; - if (dt->num_enabled && (hash == table_hash_value)) - return 0; - } - return 1; -} - -/* * Search the tables for _ddebug's which match the given * `query' and apply the `flags' and `mask' to them. 
Tells * the user which ddebug's were changed, or whether none @@ -168,19 +141,10 @@ static void ddebug_change(const struct ddebug_query *query, else if (!dp->flags) dt->num_enabled++; dp->flags = newflags; - if (newflags) { - dynamic_debug_enabled |= - (1LL << dp->primary_hash); - dynamic_debug_enabled2 |= - (1LL << dp->secondary_hash); - } else { - if (disabled_hash(dp->primary_hash, true)) - dynamic_debug_enabled &= - ~(1LL << dp->primary_hash); - if (disabled_hash(dp->secondary_hash, false)) - dynamic_debug_enabled2 &= - ~(1LL << dp->secondary_hash); - } + if (newflags) + dp->enabled = 1; + else + dp->enabled = 0; if (verbose) printk(KERN_INFO "ddebug: changed %s:%d [%s]%s %s\n", @@ -428,6 +392,40 @@ static int ddebug_parse_flags(const char *str, unsigned int *flagsp, return 0; } +static int ddebug_exec_query(char *query_string) +{ + unsigned int flags = 0, mask = 0; + struct ddebug_query query; +#define MAXWORDS 9 + int nwords; + char *words[MAXWORDS]; + + nwords = ddebug_tokenize(query_string, words, MAXWORDS); + if (nwords <= 0) + return -EINVAL; + if (ddebug_parse_query(words, nwords-1, &query)) + return -EINVAL; + if (ddebug_parse_flags(words[nwords-1], &flags, &mask)) + return -EINVAL; + + /* actually go and implement the change */ + ddebug_change(&query, flags, mask); + return 0; +} + +static __initdata char ddebug_setup_string[1024]; +static __init int ddebug_setup_query(char *str) +{ + if (strlen(str) >= 1024) { + pr_warning("ddebug boot param string too large\n"); + return 0; + } + strcpy(ddebug_setup_string, str); + return 1; +} + +__setup("ddebug_query=", ddebug_setup_query); + /* * File_ops->write method for <debugfs>/dynamic_debug/conrol. Gathers the * command text from userspace, parses and executes it. @@ -435,12 +433,8 @@ static int ddebug_parse_flags(const char *str, unsigned int *flagsp, static ssize_t ddebug_proc_write(struct file *file, const char __user *ubuf, size_t len, loff_t *offp) { - unsigned int flags = 0, mask = 0; - struct ddebug_query query; -#define MAXWORDS 9 - int nwords; - char *words[MAXWORDS]; char tmpbuf[256]; + int ret; if (len == 0) return 0; @@ -454,16 +448,9 @@ static ssize_t ddebug_proc_write(struct file *file, const char __user *ubuf, printk(KERN_INFO "%s: read %d bytes from userspace\n", __func__, (int)len); - nwords = ddebug_tokenize(tmpbuf, words, MAXWORDS); - if (nwords < 0) - return -EINVAL; - if (ddebug_parse_query(words, nwords-1, &query)) - return -EINVAL; - if (ddebug_parse_flags(words[nwords-1], &flags, &mask)) - return -EINVAL; - - /* actually go and implement the change */ - ddebug_change(&query, flags, mask); + ret = ddebug_exec_query(tmpbuf); + if (ret) + return ret; *offp += len; return len; @@ -691,7 +678,7 @@ static void ddebug_table_free(struct ddebug_table *dt) * Called in response to a module being unloaded. Removes * any ddebug_table's which point at the module. 
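The new ddebug_query= parameter accepts the same query syntax that can be written to <debugfs>/dynamic_debug/control, so selected pr_debug() call sites can be switched on from the boot command line before debugfs is even available; for instance (file name purely illustrative):

	ddebug_query="file init/main.c +p"

which the arch_initcall-time dynamic_debug_init() now hands to ddebug_exec_query().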
*/ -int ddebug_remove_module(char *mod_name) +int ddebug_remove_module(const char *mod_name) { struct ddebug_table *dt, *nextdt; int ret = -ENOENT; @@ -724,13 +711,14 @@ static void ddebug_remove_all_tables(void) mutex_unlock(&ddebug_lock); } -static int __init dynamic_debug_init(void) +static __initdata int ddebug_init_success; + +static int __init dynamic_debug_init_debugfs(void) { struct dentry *dir, *file; - struct _ddebug *iter, *iter_start; - const char *modname = NULL; - int ret = 0; - int n = 0; + + if (!ddebug_init_success) + return -ENODEV; dir = debugfs_create_dir("dynamic_debug", NULL); if (!dir) @@ -741,6 +729,16 @@ static int __init dynamic_debug_init(void) debugfs_remove(dir); return -ENOMEM; } + return 0; +} + +static int __init dynamic_debug_init(void) +{ + struct _ddebug *iter, *iter_start; + const char *modname = NULL; + int ret = 0; + int n = 0; + if (__start___verbose != __stop___verbose) { iter = __start___verbose; modname = iter->modname; @@ -758,12 +756,26 @@ static int __init dynamic_debug_init(void) } ret = ddebug_add_module(iter_start, n, modname); } + + /* ddebug_query boot param got passed -> set it up */ + if (ddebug_setup_string[0] != '\0') { + ret = ddebug_exec_query(ddebug_setup_string); + if (ret) + pr_warning("Invalid ddebug boot param %s", + ddebug_setup_string); + else + pr_info("ddebug initialized with string %s", + ddebug_setup_string); + } + out_free: - if (ret) { + if (ret) ddebug_remove_all_tables(); - debugfs_remove(dir); - debugfs_remove(file); - } + else + ddebug_init_success = 1; return 0; } -module_init(dynamic_debug_init); +/* Allow early initialization for boot messages via boot param */ +arch_initcall(dynamic_debug_init); +/* Debugfs setup must be done later */ +module_init(dynamic_debug_init_debugfs); diff --git a/lib/flex_array.c b/lib/flex_array.c index 66eef2e4483e..c0ea40ba2082 100644 --- a/lib/flex_array.c +++ b/lib/flex_array.c @@ -23,6 +23,7 @@ #include <linux/flex_array.h> #include <linux/slab.h> #include <linux/stddef.h> +#include <linux/module.h> struct flex_array_part { char elements[FLEX_ARRAY_PART_SIZE]; @@ -99,10 +100,11 @@ struct flex_array *flex_array_alloc(int element_size, unsigned int total, ret->element_size = element_size; ret->total_nr_elements = total; if (elements_fit_in_base(ret) && !(flags & __GFP_ZERO)) - memset(ret->parts[0], FLEX_ARRAY_FREE, + memset(&ret->parts[0], FLEX_ARRAY_FREE, FLEX_ARRAY_BASE_BYTES_LEFT); return ret; } +EXPORT_SYMBOL(flex_array_alloc); static int fa_element_to_part_nr(struct flex_array *fa, unsigned int element_nr) @@ -126,12 +128,14 @@ void flex_array_free_parts(struct flex_array *fa) for (part_nr = 0; part_nr < FLEX_ARRAY_NR_BASE_PTRS; part_nr++) kfree(fa->parts[part_nr]); } +EXPORT_SYMBOL(flex_array_free_parts); void flex_array_free(struct flex_array *fa) { flex_array_free_parts(fa); kfree(fa); } +EXPORT_SYMBOL(flex_array_free); static unsigned int index_inside_part(struct flex_array *fa, unsigned int element_nr) @@ -171,6 +175,8 @@ __fa_get_part(struct flex_array *fa, int part_nr, gfp_t flags) * Note that this *copies* the contents of @src into * the array. If you are trying to store an array of * pointers, make sure to pass in &ptr instead of ptr. + * You may instead wish to use the flex_array_put_ptr() + * helper function. * * Locking must be provided by the caller. 
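Since flex_array_put() copies element_size bytes out of @src, storing a pointer means handing in the address of that pointer and using the new flex_array_get_ptr() (added further down in this hunk) to get the value back out. A minimal sketch, with a hypothetical struct foo and a caller-provided array, not taken from this patch:

	#include <linux/flex_array.h>
	#include <linux/slab.h>
	#include <linux/errno.h>

	struct foo;	/* hypothetical element type */

	static int store_and_fetch(struct flex_array *fa, struct foo *obj)
	{
		/* the array holds pointer-sized slots; passing &obj copies
		 * the pointer value itself into slot 5 */
		int err = flex_array_put(fa, 5, &obj, GFP_KERNEL);

		if (err)
			return err;

		/* flex_array_get_ptr() returns the stored pointer, not its address */
		return flex_array_get_ptr(fa, 5) == obj ? 0 : -EINVAL;
	}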
*/ @@ -194,6 +200,7 @@ int flex_array_put(struct flex_array *fa, unsigned int element_nr, void *src, memcpy(dst, src, fa->element_size); return 0; } +EXPORT_SYMBOL(flex_array_put); /** * flex_array_clear - clear element in array at @element_nr @@ -221,6 +228,7 @@ int flex_array_clear(struct flex_array *fa, unsigned int element_nr) memset(dst, FLEX_ARRAY_FREE, fa->element_size); return 0; } +EXPORT_SYMBOL(flex_array_clear); /** * flex_array_prealloc - guarantee that array space exists @@ -257,6 +265,7 @@ int flex_array_prealloc(struct flex_array *fa, unsigned int start, } return 0; } +EXPORT_SYMBOL(flex_array_prealloc); /** * flex_array_get - pull data back out of the array @@ -265,7 +274,8 @@ int flex_array_prealloc(struct flex_array *fa, unsigned int start, * * Returns a pointer to the data at index @element_nr. Note * that this is a copy of the data that was passed in. If you - * are using this to store pointers, you'll get back &ptr. + * are using this to store pointers, you'll get back &ptr. You + * may instead wish to use the flex_array_get_ptr helper. * * Locking must be provided by the caller. */ @@ -285,6 +295,28 @@ void *flex_array_get(struct flex_array *fa, unsigned int element_nr) } return &part->elements[index_inside_part(fa, element_nr)]; } +EXPORT_SYMBOL(flex_array_get); + +/** + * flex_array_get_ptr - pull a ptr back out of the array + * @fa: the flex array from which to extract data + * @element_nr: index of the element to fetch from the array + * + * Returns the pointer placed in the flex array at element_nr using + * flex_array_put_ptr(). This function should not be called if the + * element in question was not set using the _put_ptr() helper. + */ +void *flex_array_get_ptr(struct flex_array *fa, unsigned int element_nr) +{ + void **tmp; + + tmp = flex_array_get(fa, element_nr); + if (!tmp) + return NULL; + + return *tmp; +} +EXPORT_SYMBOL(flex_array_get_ptr); static int part_is_free(struct flex_array_part *part) { @@ -325,3 +357,4 @@ int flex_array_shrink(struct flex_array *fa) } return ret; } +EXPORT_SYMBOL(flex_array_shrink); diff --git a/lib/gen_crc32table.c b/lib/gen_crc32table.c index bea5d97df991..85d0e412a04f 100644 --- a/lib/gen_crc32table.c +++ b/lib/gen_crc32table.c @@ -7,8 +7,8 @@ #define LE_TABLE_SIZE (1 << CRC_LE_BITS) #define BE_TABLE_SIZE (1 << CRC_BE_BITS) -static uint32_t crc32table_le[LE_TABLE_SIZE]; -static uint32_t crc32table_be[BE_TABLE_SIZE]; +static uint32_t crc32table_le[4][LE_TABLE_SIZE]; +static uint32_t crc32table_be[4][BE_TABLE_SIZE]; /** * crc32init_le() - allocate and initialize LE table data @@ -22,12 +22,19 @@ static void crc32init_le(void) unsigned i, j; uint32_t crc = 1; - crc32table_le[0] = 0; + crc32table_le[0][0] = 0; for (i = 1 << (CRC_LE_BITS - 1); i; i >>= 1) { crc = (crc >> 1) ^ ((crc & 1) ? CRCPOLY_LE : 0); for (j = 0; j < LE_TABLE_SIZE; j += 2 * i) - crc32table_le[i + j] = crc ^ crc32table_le[j]; + crc32table_le[0][i + j] = crc ^ crc32table_le[0][j]; + } + for (i = 0; i < LE_TABLE_SIZE; i++) { + crc = crc32table_le[0][i]; + for (j = 1; j < 4; j++) { + crc = crc32table_le[0][crc & 0xff] ^ (crc >> 8); + crc32table_le[j][i] = crc; + } } } @@ -39,25 +46,35 @@ static void crc32init_be(void) unsigned i, j; uint32_t crc = 0x80000000; - crc32table_be[0] = 0; + crc32table_be[0][0] = 0; for (i = 1; i < BE_TABLE_SIZE; i <<= 1) { crc = (crc << 1) ^ ((crc & 0x80000000) ? 
CRCPOLY_BE : 0); for (j = 0; j < i; j++) - crc32table_be[i + j] = crc ^ crc32table_be[j]; + crc32table_be[0][i + j] = crc ^ crc32table_be[0][j]; + } + for (i = 0; i < BE_TABLE_SIZE; i++) { + crc = crc32table_be[0][i]; + for (j = 1; j < 4; j++) { + crc = crc32table_be[0][(crc >> 24) & 0xff] ^ (crc << 8); + crc32table_be[j][i] = crc; + } } } -static void output_table(uint32_t table[], int len, char *trans) +static void output_table(uint32_t table[4][256], int len, char *trans) { - int i; + int i, j; - for (i = 0; i < len - 1; i++) { - if (i % ENTRIES_PER_LINE == 0) - printf("\n"); - printf("%s(0x%8.8xL), ", trans, table[i]); + for (j = 0 ; j < 4; j++) { + printf("{"); + for (i = 0; i < len - 1; i++) { + if (i % ENTRIES_PER_LINE == 0) + printf("\n"); + printf("%s(0x%8.8xL), ", trans, table[j][i]); + } + printf("%s(0x%8.8xL)},\n", trans, table[j][len - 1]); } - printf("%s(0x%8.8xL)\n", trans, table[len - 1]); } int main(int argc, char** argv) @@ -66,14 +83,14 @@ int main(int argc, char** argv) if (CRC_LE_BITS > 1) { crc32init_le(); - printf("static const u32 crc32table_le[] = {"); + printf("static const u32 crc32table_le[4][256] = {"); output_table(crc32table_le, LE_TABLE_SIZE, "tole"); printf("};\n"); } if (CRC_BE_BITS > 1) { crc32init_be(); - printf("static const u32 crc32table_be[] = {"); + printf("static const u32 crc32table_be[4][256] = {"); output_table(crc32table_be, BE_TABLE_SIZE, "tobe"); printf("};\n"); } diff --git a/lib/genalloc.c b/lib/genalloc.c index e67f97495dd5..1923f1490e72 100644 --- a/lib/genalloc.c +++ b/lib/genalloc.c @@ -10,6 +10,7 @@ * Version 2. See the file COPYING for more details. */ +#include <linux/slab.h> #include <linux/module.h> #include <linux/bitmap.h> #include <linux/genalloc.h> @@ -127,7 +128,6 @@ unsigned long gen_pool_alloc(struct gen_pool *pool, size_t size) chunk = list_entry(_chunk, struct gen_pool_chunk, next_chunk); end_bit = (chunk->end_addr - chunk->start_addr) >> order; - end_bit -= nbits + 1; spin_lock_irqsave(&chunk->lock, flags); start_bit = bitmap_find_next_zero_area(chunk->bits, end_bit, 0, diff --git a/lib/hexdump.c b/lib/hexdump.c index 39af2560f765..f5fe6ba7a3ab 100644 --- a/lib/hexdump.c +++ b/lib/hexdump.c @@ -16,6 +16,40 @@ const char hex_asc[] = "0123456789abcdef"; EXPORT_SYMBOL(hex_asc); /** + * hex_to_bin - convert a hex digit to its real value + * @ch: ascii character represents hex digit + * + * hex_to_bin() converts one hex digit to its actual value or -1 in case of bad + * input. 
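To illustrate the new helper's contract: hex_to_bin('7') returns 7, hex_to_bin('a') and hex_to_bin('A') both return 10 (the implementation below lowercases its input first), and hex_to_bin('g') returns -1. The companion hex2bin() packs two digits per output byte, so for example:

	u8 buf[2];

	hex2bin(buf, "c0de", 2);	/* buf[0] == 0xc0, buf[1] == 0xde */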
+ */ +int hex_to_bin(char ch) +{ + if ((ch >= '0') && (ch <= '9')) + return ch - '0'; + ch = tolower(ch); + if ((ch >= 'a') && (ch <= 'f')) + return ch - 'a' + 10; + return -1; +} +EXPORT_SYMBOL(hex_to_bin); + +/** + * hex2bin - convert an ascii hexadecimal string to its binary representation + * @dst: binary result + * @src: ascii hexadecimal string + * @count: result length + */ +void hex2bin(u8 *dst, const char *src, size_t count) +{ + while (count--) { + *dst = hex_to_bin(*src++) << 4; + *dst += hex_to_bin(*src++); + dst++; + } +} +EXPORT_SYMBOL(hex2bin); + +/** * hex_dump_to_buffer - convert a blob of data to "hex ASCII" in memory * @buf: data blob to dump * @len: number of bytes in the @buf @@ -34,7 +68,7 @@ EXPORT_SYMBOL(hex_asc); * * E.g.: * hex_dump_to_buffer(frame->data, frame->len, 16, 1, - * linebuf, sizeof(linebuf), 1); + * linebuf, sizeof(linebuf), true); * * example output buffer: * 40 41 42 43 44 45 46 47 48 49 4a 4b 4c 4d 4e 4f @ABCDEFGHIJKLMNO @@ -65,8 +99,8 @@ void hex_dump_to_buffer(const void *buf, size_t len, int rowsize, for (j = 0; j < ngroups; j++) lx += scnprintf(linebuf + lx, linebuflen - lx, - "%s%16.16llx", j ? " " : "", - (unsigned long long)*(ptr8 + j)); + "%s%16.16llx", j ? " " : "", + (unsigned long long)*(ptr8 + j)); ascii_column = 17 * ngroups + 2; break; } @@ -77,7 +111,7 @@ void hex_dump_to_buffer(const void *buf, size_t len, int rowsize, for (j = 0; j < ngroups; j++) lx += scnprintf(linebuf + lx, linebuflen - lx, - "%s%8.8x", j ? " " : "", *(ptr4 + j)); + "%s%8.8x", j ? " " : "", *(ptr4 + j)); ascii_column = 9 * ngroups + 2; break; } @@ -88,7 +122,7 @@ void hex_dump_to_buffer(const void *buf, size_t len, int rowsize, for (j = 0; j < ngroups; j++) lx += scnprintf(linebuf + lx, linebuflen - lx, - "%s%4.4x", j ? " " : "", *(ptr2 + j)); + "%s%4.4x", j ? " " : "", *(ptr2 + j)); ascii_column = 5 * ngroups + 2; break; } @@ -111,14 +145,16 @@ void hex_dump_to_buffer(const void *buf, size_t len, int rowsize, while (lx < (linebuflen - 1) && lx < (ascii_column - 1)) linebuf[lx++] = ' '; - for (j = 0; (j < len) && (lx + 2) < linebuflen; j++) - linebuf[lx++] = (isascii(ptr[j]) && isprint(ptr[j])) ? ptr[j] - : '.'; + for (j = 0; (j < len) && (lx + 2) < linebuflen; j++) { + ch = ptr[j]; + linebuf[lx++] = (isascii(ch) && isprint(ch)) ? ch : '.'; + } nil: linebuf[lx++] = '\0'; } EXPORT_SYMBOL(hex_dump_to_buffer); +#ifdef CONFIG_PRINTK /** * print_hex_dump - print a text hex dump to syslog for a binary blob of data * @level: kernel log level (e.g. KERN_DEBUG) @@ -143,7 +179,7 @@ EXPORT_SYMBOL(hex_dump_to_buffer); * * E.g.: * print_hex_dump(KERN_DEBUG, "raw data: ", DUMP_PREFIX_ADDRESS, - * 16, 1, frame->data, frame->len, 1); + * 16, 1, frame->data, frame->len, true); * * Example output using %DUMP_PREFIX_OFFSET and 1-byte mode: * 0009ab42: 40 41 42 43 44 45 46 47 48 49 4a 4b 4c 4d 4e 4f @ABCDEFGHIJKLMNO @@ -151,12 +187,12 @@ EXPORT_SYMBOL(hex_dump_to_buffer); * ffffffff88089af0: 73727170 77767574 7b7a7978 7f7e7d7c pqrstuvwxyz{|}~. 
*/ void print_hex_dump(const char *level, const char *prefix_str, int prefix_type, - int rowsize, int groupsize, - const void *buf, size_t len, bool ascii) + int rowsize, int groupsize, + const void *buf, size_t len, bool ascii) { const u8 *ptr = buf; int i, linelen, remaining = len; - unsigned char linebuf[200]; + unsigned char linebuf[32 * 3 + 2 + 32 + 1]; if (rowsize != 16 && rowsize != 32) rowsize = 16; @@ -164,13 +200,14 @@ void print_hex_dump(const char *level, const char *prefix_str, int prefix_type, for (i = 0; i < len; i += rowsize) { linelen = min(remaining, rowsize); remaining -= rowsize; + hex_dump_to_buffer(ptr + i, linelen, rowsize, groupsize, - linebuf, sizeof(linebuf), ascii); + linebuf, sizeof(linebuf), ascii); switch (prefix_type) { case DUMP_PREFIX_ADDRESS: - printk("%s%s%*p: %s\n", level, prefix_str, - (int)(2 * sizeof(void *)), ptr + i, linebuf); + printk("%s%s%p: %s\n", + level, prefix_str, ptr + i, linebuf); break; case DUMP_PREFIX_OFFSET: printk("%s%s%.8x: %s\n", level, prefix_str, i, linebuf); @@ -196,9 +233,10 @@ EXPORT_SYMBOL(print_hex_dump); * rowsize of 16, groupsize of 1, and ASCII output included. */ void print_hex_dump_bytes(const char *prefix_str, int prefix_type, - const void *buf, size_t len) + const void *buf, size_t len) { print_hex_dump(KERN_DEBUG, prefix_str, prefix_type, 16, 1, - buf, len, 1); + buf, len, true); } EXPORT_SYMBOL(print_hex_dump_bytes); +#endif diff --git a/lib/hweight.c b/lib/hweight.c index 389424ecb129..3c79d50814cf 100644 --- a/lib/hweight.c +++ b/lib/hweight.c @@ -9,37 +9,45 @@ * The Hamming Weight of a number is the total number of bits set in it. */ -unsigned int hweight32(unsigned int w) +unsigned int __sw_hweight32(unsigned int w) { +#ifdef ARCH_HAS_FAST_MULTIPLIER + w -= (w >> 1) & 0x55555555; + w = (w & 0x33333333) + ((w >> 2) & 0x33333333); + w = (w + (w >> 4)) & 0x0f0f0f0f; + return (w * 0x01010101) >> 24; +#else unsigned int res = w - ((w >> 1) & 0x55555555); res = (res & 0x33333333) + ((res >> 2) & 0x33333333); res = (res + (res >> 4)) & 0x0F0F0F0F; res = res + (res >> 8); return (res + (res >> 16)) & 0x000000FF; +#endif } -EXPORT_SYMBOL(hweight32); +EXPORT_SYMBOL(__sw_hweight32); -unsigned int hweight16(unsigned int w) +unsigned int __sw_hweight16(unsigned int w) { unsigned int res = w - ((w >> 1) & 0x5555); res = (res & 0x3333) + ((res >> 2) & 0x3333); res = (res + (res >> 4)) & 0x0F0F; return (res + (res >> 8)) & 0x00FF; } -EXPORT_SYMBOL(hweight16); +EXPORT_SYMBOL(__sw_hweight16); -unsigned int hweight8(unsigned int w) +unsigned int __sw_hweight8(unsigned int w) { unsigned int res = w - ((w >> 1) & 0x55); res = (res & 0x33) + ((res >> 2) & 0x33); return (res + (res >> 4)) & 0x0F; } -EXPORT_SYMBOL(hweight8); +EXPORT_SYMBOL(__sw_hweight8); -unsigned long hweight64(__u64 w) +unsigned long __sw_hweight64(__u64 w) { #if BITS_PER_LONG == 32 - return hweight32((unsigned int)(w >> 32)) + hweight32((unsigned int)w); + return __sw_hweight32((unsigned int)(w >> 32)) + + __sw_hweight32((unsigned int)w); #elif BITS_PER_LONG == 64 #ifdef ARCH_HAS_FAST_MULTIPLIER w -= (w >> 1) & 0x5555555555555555ul; @@ -56,4 +64,4 @@ unsigned long hweight64(__u64 w) #endif #endif } -EXPORT_SYMBOL(hweight64); +EXPORT_SYMBOL(__sw_hweight64); diff --git a/lib/idr.c b/lib/idr.c index 1cac726c44bc..e15502e8b21e 100644 --- a/lib/idr.c +++ b/lib/idr.c @@ -106,16 +106,17 @@ static void idr_mark_full(struct idr_layer **pa, int id) } /** - * idr_pre_get - reserver resources for idr allocation + * idr_pre_get - reserve resources for idr allocation * @idp: idr 
handle * @gfp_mask: memory allocation flags * - * This function should be called prior to locking and calling the - * idr_get_new* functions. It preallocates enough memory to satisfy - * the worst possible allocation. + * This function should be called prior to calling the idr_get_new* functions. + * It preallocates enough memory to satisfy the worst possible allocation. The + * caller should pass in GFP_KERNEL if possible. This of course requires that + * no spinning locks be held. * - * If the system is REALLY out of memory this function returns 0, - * otherwise 1. + * If the system is REALLY out of memory this function returns %0, + * otherwise %1. */ int idr_pre_get(struct idr *idp, gfp_t gfp_mask) { @@ -156,10 +157,12 @@ static int sub_alloc(struct idr *idp, int *starting_id, struct idr_layer **pa) id = (id | ((1 << (IDR_BITS * l)) - 1)) + 1; /* if already at the top layer, we need to grow */ - if (!(p = pa[l])) { + if (id >= 1 << (idp->layers * IDR_BITS)) { *starting_id = id; return IDR_NEED_TO_GROW; } + p = pa[l]; + BUG_ON(!p); /* If we need to go up one layer, continue the * loop; otherwise, restart from the top. @@ -282,17 +285,19 @@ static int idr_get_new_above_int(struct idr *idp, void *ptr, int starting_id) * idr_get_new_above - allocate new idr entry above or equal to a start id * @idp: idr handle * @ptr: pointer you want associated with the id - * @start_id: id to start search at + * @starting_id: id to start search at * @id: pointer to the allocated handle * * This is the allocate id function. It should be called with any * required locks. * - * If memory is required, it will return -EAGAIN, you should unlock - * and go back to the idr_pre_get() call. If the idr is full, it will - * return -ENOSPC. + * If allocation from IDR's private freelist fails, idr_get_new_above() will + * return %-EAGAIN. The caller should retry the idr_pre_get() call to refill + * IDR's preallocation and then retry the idr_get_new_above() call. + * + * If the idr is full idr_get_new_above() will return %-ENOSPC. * - * @id returns a value in the range @starting_id ... 0x7fffffff + * @id returns a value in the range @starting_id ... %0x7fffffff */ int idr_get_new_above(struct idr *idp, void *ptr, int starting_id, int *id) { @@ -316,14 +321,13 @@ EXPORT_SYMBOL(idr_get_new_above); * @ptr: pointer you want associated with the id * @id: pointer to the allocated handle * - * This is the allocate id function. It should be called with any - * required locks. + * If allocation from IDR's private freelist fails, idr_get_new_above() will + * return %-EAGAIN. The caller should retry the idr_pre_get() call to refill + * IDR's preallocation and then retry the idr_get_new_above() call. * - * If memory is required, it will return -EAGAIN, you should unlock - * and go back to the idr_pre_get() call. If the idr is full, it will - * return -ENOSPC. + * If the idr is full idr_get_new_above() will return %-ENOSPC. * - * @id returns a value in the range 0 ... 0x7fffffff + * @id returns a value in the range %0 ... %0x7fffffff */ int idr_get_new(struct idr *idp, void *ptr, int *id) { @@ -386,7 +390,7 @@ static void sub_remove(struct idr *idp, int shift, int id) } /** - * idr_remove - remove the given id and free it's slot + * idr_remove - remove the given id and free its slot * @idp: idr handle * @id: unique key */ @@ -435,7 +439,7 @@ EXPORT_SYMBOL(idr_remove); * function will remove all id mappings and leave all idp_layers * unused. 
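The allocation kerneldoc above spells out the usual two-step pattern: preallocate outside the lock with idr_pre_get(), allocate under it, and retry on -EAGAIN. A minimal caller sketch, where my_idr, my_lock, and obj are hypothetical and not from this patch:

	/* my_idr and my_lock are assumed to be defined elsewhere */
	static int attach_object(void *obj)
	{
		int id, ret;
	retry:
		if (!idr_pre_get(&my_idr, GFP_KERNEL))
			return -ENOMEM;		/* could not preallocate a layer */

		spin_lock(&my_lock);
		ret = idr_get_new_above(&my_idr, obj, 1, &id);
		spin_unlock(&my_lock);

		if (ret == -EAGAIN)
			goto retry;		/* preallocation was consumed; refill and retry */
		if (ret)
			return ret;		/* -ENOSPC: no free ids */

		return id;			/* in the range 1 ... 0x7fffffff */
	}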
* - * A typical clean-up sequence for objects stored in an idr tree, will + * A typical clean-up sequence for objects stored in an idr tree will * use idr_for_each() to free all objects, if necessay, then * idr_remove_all() to remove all ids, and idr_destroy() to free * up the cached idr_layers. @@ -443,6 +447,7 @@ EXPORT_SYMBOL(idr_remove); void idr_remove_all(struct idr *idp) { int n, id, max; + int bt_mask; struct idr_layer *p; struct idr_layer *pa[MAX_LEVEL]; struct idr_layer **paa = &pa[0]; @@ -460,8 +465,10 @@ void idr_remove_all(struct idr *idp) p = p->ary[(id >> n) & IDR_MASK]; } + bt_mask = id; id += 1 << n; - while (n < fls(id)) { + /* Get the highest bit that the above add changed from 0->1. */ + while (n < fls(id ^ bt_mask)) { if (p) free_layer(p); n += IDR_BITS; @@ -474,7 +481,7 @@ EXPORT_SYMBOL(idr_remove_all); /** * idr_destroy - release all cached layers within an idr tree - * idp: idr handle + * @idp: idr handle */ void idr_destroy(struct idr *idp) { @@ -502,7 +509,7 @@ void *idr_find(struct idr *idp, int id) int n; struct idr_layer *p; - p = rcu_dereference(idp->top); + p = rcu_dereference_raw(idp->top); if (!p) return NULL; n = (p->layer+1) * IDR_BITS; @@ -517,7 +524,7 @@ void *idr_find(struct idr *idp, int id) while (n > 0 && p) { n -= IDR_BITS; BUG_ON(n != p->layer*IDR_BITS); - p = rcu_dereference(p->ary[(id >> n) & IDR_MASK]); + p = rcu_dereference_raw(p->ary[(id >> n) & IDR_MASK]); } return((void *)p); } @@ -537,7 +544,7 @@ EXPORT_SYMBOL(idr_find); * not allowed. * * We check the return of @fn each time. If it returns anything other - * than 0, we break out and return that value. + * than %0, we break out and return that value. * * The caller must serialize idr_for_each() vs idr_get_new() and idr_remove(). */ @@ -550,7 +557,7 @@ int idr_for_each(struct idr *idp, struct idr_layer **paa = &pa[0]; n = idp->layers * IDR_BITS; - p = rcu_dereference(idp->top); + p = rcu_dereference_raw(idp->top); max = 1 << n; id = 0; @@ -558,7 +565,7 @@ int idr_for_each(struct idr *idp, while (n > 0 && p) { n -= IDR_BITS; *paa++ = p; - p = rcu_dereference(p->ary[(id >> n) & IDR_MASK]); + p = rcu_dereference_raw(p->ary[(id >> n) & IDR_MASK]); } if (p) { @@ -581,10 +588,11 @@ EXPORT_SYMBOL(idr_for_each); /** * idr_get_next - lookup next object of id to given id. * @idp: idr handle - * @id: pointer to lookup key + * @nextidp: pointer to lookup key * * Returns pointer to registered object with id, which is next number to - * given id. + * given id. After being looked up, *@nextidp will be updated for the next + * iteration. */ void *idr_get_next(struct idr *idp, int *nextidp) @@ -597,7 +605,7 @@ void *idr_get_next(struct idr *idp, int *nextidp) /* find first ent */ n = idp->layers * IDR_BITS; max = 1 << n; - p = rcu_dereference(idp->top); + p = rcu_dereference_raw(idp->top); if (!p) return NULL; @@ -605,7 +613,7 @@ void *idr_get_next(struct idr *idp, int *nextidp) while (n > 0 && p) { n -= IDR_BITS; *paa++ = p; - p = rcu_dereference(p->ary[(id >> n) & IDR_MASK]); + p = rcu_dereference_raw(p->ary[(id >> n) & IDR_MASK]); } if (p) { @@ -621,7 +629,7 @@ void *idr_get_next(struct idr *idp, int *nextidp) } return NULL; } - +EXPORT_SYMBOL(idr_get_next); /** @@ -631,8 +639,8 @@ void *idr_get_next(struct idr *idp, int *nextidp) * @id: lookup key * * Replace the pointer registered with an id and return the old value. - * A -ENOENT return indicates that @id was not found. - * A -EINVAL return indicates that @id was not within valid constraints. + * A %-ENOENT return indicates that @id was not found. 
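The clean-up sequence mentioned in the idr_remove_all() comment amounts to roughly the sketch below; my_free_one() and my_idr_teardown() are hypothetical helpers, not part of the patch:

#include <linux/idr.h>
#include <linux/slab.h>

static int my_free_one(int id, void *p, void *data)
{
	kfree(p);		/* release the object stored under this id */
	return 0;
}

static void my_idr_teardown(struct idr *idp)
{
	idr_for_each(idp, my_free_one, NULL);	/* free all stored objects */
	idr_remove_all(idp);			/* drop all id mappings */
	idr_destroy(idp);			/* free the cached idr_layers */
}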
+ * A %-EINVAL return indicates that @id was not within valid constraints. * * The caller must serialize with writers. */ @@ -690,10 +698,11 @@ void idr_init(struct idr *idp) EXPORT_SYMBOL(idr_init); -/* +/** + * DOC: IDA description * IDA - IDR based ID allocator * - * this is id allocator without id -> pointer translation. Memory + * This is id allocator without id -> pointer translation. Memory * usage is much lower than full blown idr because each id only * occupies a bit. ida uses a custom leaf node which contains * IDA_BITMAP_BITS slots. @@ -726,8 +735,8 @@ static void free_bitmap(struct ida *ida, struct ida_bitmap *bitmap) * following function. It preallocates enough memory to satisfy the * worst possible allocation. * - * If the system is REALLY out of memory this function returns 0, - * otherwise 1. + * If the system is REALLY out of memory this function returns %0, + * otherwise %1. */ int ida_pre_get(struct ida *ida, gfp_t gfp_mask) { @@ -753,17 +762,17 @@ EXPORT_SYMBOL(ida_pre_get); /** * ida_get_new_above - allocate new ID above or equal to a start id * @ida: ida handle - * @staring_id: id to start search at + * @starting_id: id to start search at * @p_id: pointer to the allocated handle * * Allocate new ID above or equal to @ida. It should be called with * any required locks. * - * If memory is required, it will return -EAGAIN, you should unlock + * If memory is required, it will return %-EAGAIN, you should unlock * and go back to the ida_pre_get() call. If the ida is full, it will - * return -ENOSPC. + * return %-ENOSPC. * - * @p_id returns a value in the range @starting_id ... 0x7fffffff. + * @p_id returns a value in the range @starting_id ... %0x7fffffff. */ int ida_get_new_above(struct ida *ida, int starting_id, int *p_id) { @@ -845,11 +854,11 @@ EXPORT_SYMBOL(ida_get_new_above); * * Allocate new ID. It should be called with any required locks. * - * If memory is required, it will return -EAGAIN, you should unlock + * If memory is required, it will return %-EAGAIN, you should unlock * and go back to the idr_pre_get() call. If the idr is full, it will - * return -ENOSPC. + * return %-ENOSPC. * - * @id returns a value in the range 0 ... 0x7fffffff. + * @id returns a value in the range %0 ... %0x7fffffff. */ int ida_get_new(struct ida *ida, int *p_id) { @@ -907,7 +916,7 @@ EXPORT_SYMBOL(ida_remove); /** * ida_destroy - release all cached layers within an ida tree - * ida: ida handle + * @ida: ida handle */ void ida_destroy(struct ida *ida) { diff --git a/lib/inflate.c b/lib/inflate.c index d10255973a9f..013a76193481 100644 --- a/lib/inflate.c +++ b/lib/inflate.c @@ -103,6 +103,9 @@ the two sets of lengths. 
*/ #include <linux/compiler.h> +#ifdef NO_INFLATE_MALLOC +#include <linux/slab.h> +#endif #ifdef RCSID static char rcsid[] = "#Id: inflate.c,v 0.14 1993/06/10 13:27:04 jloup Exp #"; diff --git a/lib/iommu-helper.c b/lib/iommu-helper.c index c0251f4ad08b..da053313ee5c 100644 --- a/lib/iommu-helper.c +++ b/lib/iommu-helper.c @@ -38,12 +38,3 @@ again: return -1; } EXPORT_SYMBOL(iommu_area_alloc); - -unsigned long iommu_num_pages(unsigned long addr, unsigned long len, - unsigned long io_page_size) -{ - unsigned long size = (addr & (io_page_size - 1)) + len; - - return DIV_ROUND_UP(size, io_page_size); -} -EXPORT_SYMBOL(iommu_num_pages); diff --git a/lib/ioremap.c b/lib/ioremap.c index 14c6078f17a2..da4e2ad74b68 100644 --- a/lib/ioremap.c +++ b/lib/ioremap.c @@ -9,14 +9,15 @@ #include <linux/mm.h> #include <linux/sched.h> #include <linux/io.h> +#include <linux/module.h> #include <asm/cacheflush.h> #include <asm/pgtable.h> static int ioremap_pte_range(pmd_t *pmd, unsigned long addr, - unsigned long end, unsigned long phys_addr, pgprot_t prot) + unsigned long end, phys_addr_t phys_addr, pgprot_t prot) { pte_t *pte; - unsigned long pfn; + u64 pfn; pfn = phys_addr >> PAGE_SHIFT; pte = pte_alloc_kernel(pmd, addr); @@ -31,7 +32,7 @@ static int ioremap_pte_range(pmd_t *pmd, unsigned long addr, } static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr, - unsigned long end, unsigned long phys_addr, pgprot_t prot) + unsigned long end, phys_addr_t phys_addr, pgprot_t prot) { pmd_t *pmd; unsigned long next; @@ -49,7 +50,7 @@ static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr, } static inline int ioremap_pud_range(pgd_t *pgd, unsigned long addr, - unsigned long end, unsigned long phys_addr, pgprot_t prot) + unsigned long end, phys_addr_t phys_addr, pgprot_t prot) { pud_t *pud; unsigned long next; @@ -67,7 +68,7 @@ static inline int ioremap_pud_range(pgd_t *pgd, unsigned long addr, } int ioremap_page_range(unsigned long addr, - unsigned long end, unsigned long phys_addr, pgprot_t prot) + unsigned long end, phys_addr_t phys_addr, pgprot_t prot) { pgd_t *pgd; unsigned long start; @@ -90,3 +91,4 @@ int ioremap_page_range(unsigned long addr, return err; } +EXPORT_SYMBOL_GPL(ioremap_page_range); diff --git a/lib/kasprintf.c b/lib/kasprintf.c index c5ff1fd10030..9c4233b23783 100644 --- a/lib/kasprintf.c +++ b/lib/kasprintf.c @@ -6,6 +6,7 @@ #include <stdarg.h> #include <linux/module.h> +#include <linux/slab.h> #include <linux/types.h> #include <linux/string.h> diff --git a/lib/kobject.c b/lib/kobject.c index b512b746d2af..82dc34c095c2 100644 --- a/lib/kobject.c +++ b/lib/kobject.c @@ -700,7 +700,7 @@ static ssize_t kobj_attr_store(struct kobject *kobj, struct attribute *attr, return ret; } -struct sysfs_ops kobj_sysfs_ops = { +const struct sysfs_ops kobj_sysfs_ops = { .show = kobj_attr_show, .store = kobj_attr_store, }; @@ -746,17 +746,56 @@ void kset_unregister(struct kset *k) */ struct kobject *kset_find_obj(struct kset *kset, const char *name) { + return kset_find_obj_hinted(kset, name, NULL); +} + +/** + * kset_find_obj_hinted - search for object in kset given a predecessor hint. + * @kset: kset we're looking in. + * @name: object's name. + * @hint: hint to possible object's predecessor. + * + * Check the hint's next object and if it is a match return it directly, + * otherwise, fall back to the behavior of kset_find_obj(). Either way + * a reference for the returned object is held and the reference on the + * hinted object is released. 
+ */ +struct kobject *kset_find_obj_hinted(struct kset *kset, const char *name, + struct kobject *hint) +{ struct kobject *k; struct kobject *ret = NULL; spin_lock(&kset->list_lock); + + if (!hint) + goto slow_search; + + /* end of list detection */ + if (hint->entry.next == kset->list.next) + goto slow_search; + + k = container_of(hint->entry.next, struct kobject, entry); + if (!kobject_name(k) || strcmp(kobject_name(k), name)) + goto slow_search; + + ret = kobject_get(k); + goto unlock_exit; + +slow_search: list_for_each_entry(k, &kset->list, entry) { if (kobject_name(k) && !strcmp(kobject_name(k), name)) { ret = kobject_get(k); break; } } + +unlock_exit: spin_unlock(&kset->list_lock); + + if (hint) + kobject_put(hint); + return ret; } @@ -789,7 +828,7 @@ static struct kobj_type kset_ktype = { * If the kset was not able to be created, NULL will be returned. */ static struct kset *kset_create(const char *name, - struct kset_uevent_ops *uevent_ops, + const struct kset_uevent_ops *uevent_ops, struct kobject *parent_kobj) { struct kset *kset; @@ -832,7 +871,7 @@ static struct kset *kset_create(const char *name, * If the kset was not able to be created, NULL will be returned. */ struct kset *kset_create_and_add(const char *name, - struct kset_uevent_ops *uevent_ops, + const struct kset_uevent_ops *uevent_ops, struct kobject *parent_kobj) { struct kset *kset; @@ -850,6 +889,121 @@ struct kset *kset_create_and_add(const char *name, } EXPORT_SYMBOL_GPL(kset_create_and_add); + +static DEFINE_SPINLOCK(kobj_ns_type_lock); +static const struct kobj_ns_type_operations *kobj_ns_ops_tbl[KOBJ_NS_TYPES]; + +int kobj_ns_type_register(const struct kobj_ns_type_operations *ops) +{ + enum kobj_ns_type type = ops->type; + int error; + + spin_lock(&kobj_ns_type_lock); + + error = -EINVAL; + if (type >= KOBJ_NS_TYPES) + goto out; + + error = -EINVAL; + if (type <= KOBJ_NS_TYPE_NONE) + goto out; + + error = -EBUSY; + if (kobj_ns_ops_tbl[type]) + goto out; + + error = 0; + kobj_ns_ops_tbl[type] = ops; + +out: + spin_unlock(&kobj_ns_type_lock); + return error; +} + +int kobj_ns_type_registered(enum kobj_ns_type type) +{ + int registered = 0; + + spin_lock(&kobj_ns_type_lock); + if ((type > KOBJ_NS_TYPE_NONE) && (type < KOBJ_NS_TYPES)) + registered = kobj_ns_ops_tbl[type] != NULL; + spin_unlock(&kobj_ns_type_lock); + + return registered; +} + +const struct kobj_ns_type_operations *kobj_child_ns_ops(struct kobject *parent) +{ + const struct kobj_ns_type_operations *ops = NULL; + + if (parent && parent->ktype->child_ns_type) + ops = parent->ktype->child_ns_type(parent); + + return ops; +} + +const struct kobj_ns_type_operations *kobj_ns_ops(struct kobject *kobj) +{ + return kobj_child_ns_ops(kobj->parent); +} + + +const void *kobj_ns_current(enum kobj_ns_type type) +{ + const void *ns = NULL; + + spin_lock(&kobj_ns_type_lock); + if ((type > KOBJ_NS_TYPE_NONE) && (type < KOBJ_NS_TYPES) && + kobj_ns_ops_tbl[type]) + ns = kobj_ns_ops_tbl[type]->current_ns(); + spin_unlock(&kobj_ns_type_lock); + + return ns; +} + +const void *kobj_ns_netlink(enum kobj_ns_type type, struct sock *sk) +{ + const void *ns = NULL; + + spin_lock(&kobj_ns_type_lock); + if ((type > KOBJ_NS_TYPE_NONE) && (type < KOBJ_NS_TYPES) && + kobj_ns_ops_tbl[type]) + ns = kobj_ns_ops_tbl[type]->netlink_ns(sk); + spin_unlock(&kobj_ns_type_lock); + + return ns; +} + +const void *kobj_ns_initial(enum kobj_ns_type type) +{ + const void *ns = NULL; + + spin_lock(&kobj_ns_type_lock); + if ((type > KOBJ_NS_TYPE_NONE) && (type < KOBJ_NS_TYPES) && + 
kobj_ns_ops_tbl[type]) + ns = kobj_ns_ops_tbl[type]->initial_ns(); + spin_unlock(&kobj_ns_type_lock); + + return ns; +} + +/* + * kobj_ns_exit - invalidate a namespace tag + * + * @type: the namespace type (i.e. KOBJ_NS_TYPE_NET) + * @ns: the actual namespace being invalidated + * + * This is called when a tag is no longer valid. For instance, + * when a network namespace exits, it uses this helper to + * make sure no sb's sysfs_info points to the now-invalidated + * netns. + */ +void kobj_ns_exit(enum kobj_ns_type type, const void *ns) +{ + sysfs_exit_ns(type, ns); +} + + EXPORT_SYMBOL(kobject_get); EXPORT_SYMBOL(kobject_put); EXPORT_SYMBOL(kobject_del); diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c index 920a3ca6e259..70af0a7f97c0 100644 --- a/lib/kobject_uevent.c +++ b/lib/kobject_uevent.c @@ -18,18 +18,25 @@ #include <linux/string.h> #include <linux/kobject.h> #include <linux/module.h> - +#include <linux/slab.h> +#include <linux/user_namespace.h> #include <linux/socket.h> #include <linux/skbuff.h> #include <linux/netlink.h> #include <net/sock.h> +#include <net/net_namespace.h> u64 uevent_seqnum; char uevent_helper[UEVENT_HELPER_PATH_LEN] = CONFIG_UEVENT_HELPER_PATH; static DEFINE_SPINLOCK(sequence_lock); -#if defined(CONFIG_NET) -static struct sock *uevent_sock; +#ifdef CONFIG_NET +struct uevent_sock { + struct list_head list; + struct sock *sk; +}; +static LIST_HEAD(uevent_sock_list); +static DEFINE_MUTEX(uevent_sock_mutex); #endif /* the strings here must match the enum in include/linux/kobject.h */ @@ -76,6 +83,39 @@ out: return ret; } +#ifdef CONFIG_NET +static int kobj_bcast_filter(struct sock *dsk, struct sk_buff *skb, void *data) +{ + struct kobject *kobj = data; + const struct kobj_ns_type_operations *ops; + + ops = kobj_ns_ops(kobj); + if (ops) { + const void *sock_ns, *ns; + ns = kobj->ktype->namespace(kobj); + sock_ns = ops->netlink_ns(dsk); + return sock_ns != ns; + } + + return 0; +} +#endif + +static int kobj_usermode_filter(struct kobject *kobj) +{ + const struct kobj_ns_type_operations *ops; + + ops = kobj_ns_ops(kobj); + if (ops) { + const void *init_ns, *ns; + ns = kobj->ktype->namespace(kobj); + init_ns = ops->initial_ns(); + return ns != init_ns; + } + + return 0; +} + /** * kobject_uevent_env - send an uevent with environmental data * @@ -83,7 +123,7 @@ out: * @kobj: struct kobject that the action is happening to * @envp_ext: pointer to environmental data * - * Returns 0 if kobject_uevent() is completed with success or the + * Returns 0 if kobject_uevent_env() is completed with success or the * corresponding error when it fails. 
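A minimal, hypothetical caller of kobject_uevent_env() passing one extra environment variable, as the kernel-doc above describes; the helper name and variable are illustrative, not part of the patch:

#include <linux/kobject.h>

static int my_notify_change(struct kobject *kobj)
{
	char *envp[] = { "REASON=example", NULL };	/* extra environment data */

	return kobject_uevent_env(kobj, KOBJ_CHANGE, envp);
}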
*/ int kobject_uevent_env(struct kobject *kobj, enum kobject_action action, @@ -95,10 +135,13 @@ int kobject_uevent_env(struct kobject *kobj, enum kobject_action action, const char *subsystem; struct kobject *top_kobj; struct kset *kset; - struct kset_uevent_ops *uevent_ops; + const struct kset_uevent_ops *uevent_ops; u64 seq; int i = 0; int retval = 0; +#ifdef CONFIG_NET + struct uevent_sock *ue_sk; +#endif pr_debug("kobject: '%s' (%p): %s\n", kobject_name(kobj), kobj, __func__); @@ -210,7 +253,9 @@ int kobject_uevent_env(struct kobject *kobj, enum kobject_action action, #if defined(CONFIG_NET) /* send netlink message */ - if (uevent_sock) { + mutex_lock(&uevent_sock_mutex); + list_for_each_entry(ue_sk, &uevent_sock_list, list) { + struct sock *uevent_sock = ue_sk->sk; struct sk_buff *skb; size_t len; @@ -232,18 +277,21 @@ int kobject_uevent_env(struct kobject *kobj, enum kobject_action action, } NETLINK_CB(skb).dst_group = 1; - retval = netlink_broadcast(uevent_sock, skb, 0, 1, - GFP_KERNEL); + retval = netlink_broadcast_filtered(uevent_sock, skb, + 0, 1, GFP_KERNEL, + kobj_bcast_filter, + kobj); /* ENOBUFS should be handled in userspace */ if (retval == -ENOBUFS) retval = 0; } else retval = -ENOMEM; } + mutex_unlock(&uevent_sock_mutex); #endif /* call uevent_helper, usually only enabled during early boot */ - if (uevent_helper[0]) { + if (uevent_helper[0] && !kobj_usermode_filter(kobj)) { char *argv [3]; argv [0] = uevent_helper; @@ -269,7 +317,7 @@ exit: EXPORT_SYMBOL_GPL(kobject_uevent_env); /** - * kobject_uevent - notify userspace by ending an uevent + * kobject_uevent - notify userspace by sending an uevent * * @action: action that is happening * @kobj: struct kobject that the action is happening to @@ -319,18 +367,59 @@ int add_uevent_var(struct kobj_uevent_env *env, const char *format, ...) 
EXPORT_SYMBOL_GPL(add_uevent_var); #if defined(CONFIG_NET) -static int __init kobject_uevent_init(void) +static int uevent_net_init(struct net *net) { - uevent_sock = netlink_kernel_create(&init_net, NETLINK_KOBJECT_UEVENT, - 1, NULL, NULL, THIS_MODULE); - if (!uevent_sock) { + struct uevent_sock *ue_sk; + + ue_sk = kzalloc(sizeof(*ue_sk), GFP_KERNEL); + if (!ue_sk) + return -ENOMEM; + + ue_sk->sk = netlink_kernel_create(net, NETLINK_KOBJECT_UEVENT, + 1, NULL, NULL, THIS_MODULE); + if (!ue_sk->sk) { printk(KERN_ERR "kobject_uevent: unable to create netlink socket!\n"); + kfree(ue_sk); return -ENODEV; } - netlink_set_nonroot(NETLINK_KOBJECT_UEVENT, NL_NONROOT_RECV); + mutex_lock(&uevent_sock_mutex); + list_add_tail(&ue_sk->list, &uevent_sock_list); + mutex_unlock(&uevent_sock_mutex); return 0; } +static void uevent_net_exit(struct net *net) +{ + struct uevent_sock *ue_sk; + + mutex_lock(&uevent_sock_mutex); + list_for_each_entry(ue_sk, &uevent_sock_list, list) { + if (sock_net(ue_sk->sk) == net) + goto found; + } + mutex_unlock(&uevent_sock_mutex); + return; + +found: + list_del(&ue_sk->list); + mutex_unlock(&uevent_sock_mutex); + + netlink_kernel_release(ue_sk->sk); + kfree(ue_sk); +} + +static struct pernet_operations uevent_net_ops = { + .init = uevent_net_init, + .exit = uevent_net_exit, +}; + +static int __init kobject_uevent_init(void) +{ + netlink_set_nonroot(NETLINK_KOBJECT_UEVENT, NL_NONROOT_RECV); + return register_pernet_subsys(&uevent_net_ops); +} + + postcore_initcall(kobject_uevent_init); #endif diff --git a/lib/kref.c b/lib/kref.c index 9ecd6e865610..3efb882b11db 100644 --- a/lib/kref.c +++ b/lib/kref.c @@ -13,17 +13,7 @@ #include <linux/kref.h> #include <linux/module.h> - -/** - * kref_set - initialize object and set refcount to requested number. - * @kref: object in question. - * @num: initial reference counter - */ -void kref_set(struct kref *kref, int num) -{ - atomic_set(&kref->refcount, num); - smp_mb(); -} +#include <linux/slab.h> /** * kref_init - initialize object. @@ -31,7 +21,8 @@ void kref_set(struct kref *kref, int num) */ void kref_init(struct kref *kref) { - kref_set(kref, 1); + atomic_set(&kref->refcount, 1); + smp_mb(); } /** @@ -71,7 +62,36 @@ int kref_put(struct kref *kref, void (*release)(struct kref *kref)) return 0; } -EXPORT_SYMBOL(kref_set); + +/** + * kref_sub - subtract a number of refcounts for object. + * @kref: object. + * @count: Number of recounts to subtract. + * @release: pointer to the function that will clean up the object when the + * last reference to the object is released. + * This pointer is required, and it is not acceptable to pass kfree + * in as this function. + * + * Subtract @count from the refcount, and if 0, call release(). + * Return 1 if the object was removed, otherwise return 0. Beware, if this + * function returns 0, you still can not count on the kref from remaining in + * memory. Only use the return value if you want to see if the kref is now + * gone, not present. 
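The new kref_sub() is normally paired with a release callback, as its kernel-doc above requires; a hypothetical sketch (struct my_obj and the helpers are illustrative, not part of the patch):

#include <linux/kref.h>
#include <linux/slab.h>

struct my_obj {
	struct kref ref;
	/* ... payload ... */
};

static void my_obj_release(struct kref *kref)
{
	kfree(container_of(kref, struct my_obj, ref));
}

/* drop several references taken earlier in a single call */
static void my_obj_put_many(struct my_obj *obj, unsigned int count)
{
	kref_sub(&obj->ref, count, my_obj_release);
}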
+ */ +int kref_sub(struct kref *kref, unsigned int count, + void (*release)(struct kref *kref)) +{ + WARN_ON(release == NULL); + WARN_ON(release == (void (*)(struct kref *))kfree); + + if (atomic_sub_and_test((int) count, &kref->refcount)) { + release(kref); + return 1; + } + return 0; +} + EXPORT_SYMBOL(kref_init); EXPORT_SYMBOL(kref_get); EXPORT_SYMBOL(kref_put); +EXPORT_SYMBOL(kref_sub); diff --git a/lib/lcm.c b/lib/lcm.c new file mode 100644 index 000000000000..157cd88a6ffc --- /dev/null +++ b/lib/lcm.c @@ -0,0 +1,15 @@ +#include <linux/kernel.h> +#include <linux/gcd.h> +#include <linux/module.h> + +/* Lowest common multiple */ +unsigned long lcm(unsigned long a, unsigned long b) +{ + if (a && b) + return (a * b) / gcd(a, b); + else if (b) + return b; + + return a; +} +EXPORT_SYMBOL_GPL(lcm); diff --git a/lib/list_debug.c b/lib/list_debug.c index 1a39f4e3ae1f..344c710d16ca 100644 --- a/lib/list_debug.c +++ b/lib/list_debug.c @@ -43,6 +43,12 @@ EXPORT_SYMBOL(__list_add); */ void list_del(struct list_head *entry) { + WARN(entry->next == LIST_POISON1, + "list_del corruption, next is LIST_POISON1 (%p)\n", + LIST_POISON1); + WARN(entry->next != LIST_POISON1 && entry->prev == LIST_POISON2, + "list_del corruption, prev is LIST_POISON2 (%p)\n", + LIST_POISON2); WARN(entry->prev->next != entry, "list_del corruption. prev->next should be %p, " "but was %p\n", entry, entry->prev->next); diff --git a/lib/list_sort.c b/lib/list_sort.c index 19d11e0bb958..d7325c6b103f 100644 --- a/lib/list_sort.c +++ b/lib/list_sort.c @@ -4,99 +4,288 @@ #include <linux/slab.h> #include <linux/list.h> +#define MAX_LIST_LENGTH_BITS 20 + +/* + * Returns a list organized in an intermediate format suited + * to chaining of merge() calls: null-terminated, no reserved or + * sentinel head node, "prev" links not maintained. + */ +static struct list_head *merge(void *priv, + int (*cmp)(void *priv, struct list_head *a, + struct list_head *b), + struct list_head *a, struct list_head *b) +{ + struct list_head head, *tail = &head; + + while (a && b) { + /* if equal, take 'a' -- important for sort stability */ + if ((*cmp)(priv, a, b) <= 0) { + tail->next = a; + a = a->next; + } else { + tail->next = b; + b = b->next; + } + tail = tail->next; + } + tail->next = a?:b; + return head.next; +} + +/* + * Combine final list merge with restoration of standard doubly-linked + * list structure. This approach duplicates code from merge(), but + * runs faster than the tidier alternatives of either a separate final + * prev-link restoration pass, or maintaining the prev links + * throughout. + */ +static void merge_and_restore_back_links(void *priv, + int (*cmp)(void *priv, struct list_head *a, + struct list_head *b), + struct list_head *head, + struct list_head *a, struct list_head *b) +{ + struct list_head *tail = head; + + while (a && b) { + /* if equal, take 'a' -- important for sort stability */ + if ((*cmp)(priv, a, b) <= 0) { + tail->next = a; + a->prev = tail; + a = a->next; + } else { + tail->next = b; + b->prev = tail; + b = b->next; + } + tail = tail->next; + } + tail->next = a ? : b; + + do { + /* + * In worst cases this loop may run many iterations. + * Continue callbacks to the client even though no + * element comparison is needed, so the client's cmp() + * routine can invoke cond_resched() periodically. + */ + (*cmp)(priv, tail->next, tail->next); + + tail->next->prev = tail; + tail = tail->next; + } while (tail->next); + + tail->next = head; + head->prev = tail; +} + /** - * list_sort - sort a list. 
- * @priv: private data, passed to @cmp + * list_sort - sort a list + * @priv: private data, opaque to list_sort(), passed to @cmp * @head: the list to sort * @cmp: the elements comparison function * - * This function has been implemented by Mark J Roberts <mjr@znex.org>. It - * implements "merge sort" which has O(nlog(n)) complexity. The list is sorted - * in ascending order. + * This function implements "merge sort", which has O(nlog(n)) + * complexity. * - * The comparison function @cmp is supposed to return a negative value if @a is - * less than @b, and a positive value if @a is greater than @b. If @a and @b - * are equivalent, then it does not matter what this function returns. + * The comparison function @cmp must return a negative value if @a + * should sort before @b, and a positive value if @a should sort after + * @b. If @a and @b are equivalent, and their original relative + * ordering is to be preserved, @cmp must return 0. */ void list_sort(void *priv, struct list_head *head, - int (*cmp)(void *priv, struct list_head *a, - struct list_head *b)) + int (*cmp)(void *priv, struct list_head *a, + struct list_head *b)) { - struct list_head *p, *q, *e, *list, *tail, *oldhead; - int insize, nmerges, psize, qsize, i; + struct list_head *part[MAX_LIST_LENGTH_BITS+1]; /* sorted partial lists + -- last slot is a sentinel */ + int lev; /* index into part[] */ + int max_lev = 0; + struct list_head *list; if (list_empty(head)) return; + memset(part, 0, sizeof(part)); + + head->prev->next = NULL; list = head->next; - list_del(head); - insize = 1; - for (;;) { - p = oldhead = list; - list = tail = NULL; - nmerges = 0; - - while (p) { - nmerges++; - q = p; - psize = 0; - for (i = 0; i < insize; i++) { - psize++; - q = q->next == oldhead ? NULL : q->next; - if (!q) - break; - } - qsize = insize; - while (psize > 0 || (qsize > 0 && q)) { - if (!psize) { - e = q; - q = q->next; - qsize--; - if (q == oldhead) - q = NULL; - } else if (!qsize || !q) { - e = p; - p = p->next; - psize--; - if (p == oldhead) - p = NULL; - } else if (cmp(priv, p, q) <= 0) { - e = p; - p = p->next; - psize--; - if (p == oldhead) - p = NULL; - } else { - e = q; - q = q->next; - qsize--; - if (q == oldhead) - q = NULL; - } - if (tail) - tail->next = e; - else - list = e; - e->prev = tail; - tail = e; + while (list) { + struct list_head *cur = list; + list = list->next; + cur->next = NULL; + + for (lev = 0; part[lev]; lev++) { + cur = merge(priv, cmp, part[lev], cur); + part[lev] = NULL; + } + if (lev > max_lev) { + if (unlikely(lev >= ARRAY_SIZE(part)-1)) { + printk_once(KERN_DEBUG "list passed to" + " list_sort() too long for" + " efficiency\n"); + lev--; } - p = q; + max_lev = lev; } + part[lev] = cur; + } + + for (lev = 0; lev < max_lev; lev++) + if (part[lev]) + list = merge(priv, cmp, part[lev], list); + + merge_and_restore_back_links(priv, cmp, head, part[max_lev], list); +} +EXPORT_SYMBOL(list_sort); + +#ifdef CONFIG_TEST_LIST_SORT + +#include <linux/random.h> - tail->next = list; - list->prev = tail; +/* + * The pattern of set bits in the list length determines which cases + * are hit in list_sort(). 
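A hypothetical comparison callback and caller for the rewritten list_sort(), following the cmp() contract documented above; struct my_item and the helpers are illustrative, not part of the patch:

#include <linux/list.h>
#include <linux/list_sort.h>

struct my_item {
	struct list_head node;
	int key;
};

static int my_item_cmp(void *priv, struct list_head *a, struct list_head *b)
{
	struct my_item *ia = container_of(a, struct my_item, node);
	struct my_item *ib = container_of(b, struct my_item, node);

	/* negative: a sorts first, positive: b sorts first, 0: keep order */
	return ia->key - ib->key;
}

static void my_sort_items(struct list_head *items)
{
	list_sort(NULL, items, my_item_cmp);
}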
+ */ +#define TEST_LIST_LEN (512+128+2) /* not including head */ + +#define TEST_POISON1 0xDEADBEEF +#define TEST_POISON2 0xA324354C - if (nmerges <= 1) - break; +struct debug_el { + unsigned int poison1; + struct list_head list; + unsigned int poison2; + int value; + unsigned serial; +}; - insize *= 2; +/* Array, containing pointers to all elements in the test list */ +static struct debug_el **elts __initdata; + +static int __init check(struct debug_el *ela, struct debug_el *elb) +{ + if (ela->serial >= TEST_LIST_LEN) { + printk(KERN_ERR "list_sort_test: error: incorrect serial %d\n", + ela->serial); + return -EINVAL; } + if (elb->serial >= TEST_LIST_LEN) { + printk(KERN_ERR "list_sort_test: error: incorrect serial %d\n", + elb->serial); + return -EINVAL; + } + if (elts[ela->serial] != ela || elts[elb->serial] != elb) { + printk(KERN_ERR "list_sort_test: error: phantom element\n"); + return -EINVAL; + } + if (ela->poison1 != TEST_POISON1 || ela->poison2 != TEST_POISON2) { + printk(KERN_ERR "list_sort_test: error: bad poison: %#x/%#x\n", + ela->poison1, ela->poison2); + return -EINVAL; + } + if (elb->poison1 != TEST_POISON1 || elb->poison2 != TEST_POISON2) { + printk(KERN_ERR "list_sort_test: error: bad poison: %#x/%#x\n", + elb->poison1, elb->poison2); + return -EINVAL; + } + return 0; +} + +static int __init cmp(void *priv, struct list_head *a, struct list_head *b) +{ + struct debug_el *ela, *elb; + + ela = container_of(a, struct debug_el, list); + elb = container_of(b, struct debug_el, list); - head->next = list; - head->prev = list->prev; - list->prev->next = head; - list->prev = head; + check(ela, elb); + return ela->value - elb->value; } -EXPORT_SYMBOL(list_sort); +static int __init list_sort_test(void) +{ + int i, count = 1, err = -EINVAL; + struct debug_el *el; + struct list_head *cur, *tmp; + LIST_HEAD(head); + + printk(KERN_DEBUG "list_sort_test: start testing list_sort()\n"); + + elts = kmalloc(sizeof(void *) * TEST_LIST_LEN, GFP_KERNEL); + if (!elts) { + printk(KERN_ERR "list_sort_test: error: cannot allocate " + "memory\n"); + goto exit; + } + + for (i = 0; i < TEST_LIST_LEN; i++) { + el = kmalloc(sizeof(*el), GFP_KERNEL); + if (!el) { + printk(KERN_ERR "list_sort_test: error: cannot " + "allocate memory\n"); + goto exit; + } + /* force some equivalencies */ + el->value = random32() % (TEST_LIST_LEN/3); + el->serial = i; + el->poison1 = TEST_POISON1; + el->poison2 = TEST_POISON2; + elts[i] = el; + list_add_tail(&el->list, &head); + } + + list_sort(NULL, &head, cmp); + + for (cur = head.next; cur->next != &head; cur = cur->next) { + struct debug_el *el1; + int cmp_result; + + if (cur->next->prev != cur) { + printk(KERN_ERR "list_sort_test: error: list is " + "corrupted\n"); + goto exit; + } + + cmp_result = cmp(NULL, cur, cur->next); + if (cmp_result > 0) { + printk(KERN_ERR "list_sort_test: error: list is not " + "sorted\n"); + goto exit; + } + + el = container_of(cur, struct debug_el, list); + el1 = container_of(cur->next, struct debug_el, list); + if (cmp_result == 0 && el->serial >= el1->serial) { + printk(KERN_ERR "list_sort_test: error: order of " + "equivalent elements not preserved\n"); + goto exit; + } + + if (check(el, el1)) { + printk(KERN_ERR "list_sort_test: error: element check " + "failed\n"); + goto exit; + } + count++; + } + + if (count != TEST_LIST_LEN) { + printk(KERN_ERR "list_sort_test: error: bad list length %d", + count); + goto exit; + } + + err = 0; +exit: + kfree(elts); + list_for_each_safe(cur, tmp, &head) { + list_del(cur); + kfree(container_of(cur, 
struct debug_el, list)); + } + return err; +} +module_init(list_sort_test); +#endif /* CONFIG_TEST_LIST_SORT */ diff --git a/lib/lmb.c b/lib/lmb.c deleted file mode 100644 index 9cee17142b2c..000000000000 --- a/lib/lmb.c +++ /dev/null @@ -1,532 +0,0 @@ -/* - * Procedures for maintaining information about logical memory blocks. - * - * Peter Bergner, IBM Corp. June 2001. - * Copyright (C) 2001 Peter Bergner. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/bitops.h> -#include <linux/lmb.h> - -#define LMB_ALLOC_ANYWHERE 0 - -struct lmb lmb; - -static int lmb_debug; - -static int __init early_lmb(char *p) -{ - if (p && strstr(p, "debug")) - lmb_debug = 1; - return 0; -} -early_param("lmb", early_lmb); - -static void lmb_dump(struct lmb_region *region, char *name) -{ - unsigned long long base, size; - int i; - - pr_info(" %s.cnt = 0x%lx\n", name, region->cnt); - - for (i = 0; i < region->cnt; i++) { - base = region->region[i].base; - size = region->region[i].size; - - pr_info(" %s[0x%x]\t0x%016llx - 0x%016llx, 0x%llx bytes\n", - name, i, base, base + size - 1, size); - } -} - -void lmb_dump_all(void) -{ - if (!lmb_debug) - return; - - pr_info("LMB configuration:\n"); - pr_info(" rmo_size = 0x%llx\n", (unsigned long long)lmb.rmo_size); - pr_info(" memory.size = 0x%llx\n", (unsigned long long)lmb.memory.size); - - lmb_dump(&lmb.memory, "memory"); - lmb_dump(&lmb.reserved, "reserved"); -} - -static unsigned long lmb_addrs_overlap(u64 base1, u64 size1, u64 base2, - u64 size2) -{ - return ((base1 < (base2 + size2)) && (base2 < (base1 + size1))); -} - -static long lmb_addrs_adjacent(u64 base1, u64 size1, u64 base2, u64 size2) -{ - if (base2 == base1 + size1) - return 1; - else if (base1 == base2 + size2) - return -1; - - return 0; -} - -static long lmb_regions_adjacent(struct lmb_region *rgn, - unsigned long r1, unsigned long r2) -{ - u64 base1 = rgn->region[r1].base; - u64 size1 = rgn->region[r1].size; - u64 base2 = rgn->region[r2].base; - u64 size2 = rgn->region[r2].size; - - return lmb_addrs_adjacent(base1, size1, base2, size2); -} - -static void lmb_remove_region(struct lmb_region *rgn, unsigned long r) -{ - unsigned long i; - - for (i = r; i < rgn->cnt - 1; i++) { - rgn->region[i].base = rgn->region[i + 1].base; - rgn->region[i].size = rgn->region[i + 1].size; - } - rgn->cnt--; -} - -/* Assumption: base addr of region 1 < base addr of region 2 */ -static void lmb_coalesce_regions(struct lmb_region *rgn, - unsigned long r1, unsigned long r2) -{ - rgn->region[r1].size += rgn->region[r2].size; - lmb_remove_region(rgn, r2); -} - -void __init lmb_init(void) -{ - /* Create a dummy zero size LMB which will get coalesced away later. - * This simplifies the lmb_add() code below... - */ - lmb.memory.region[0].base = 0; - lmb.memory.region[0].size = 0; - lmb.memory.cnt = 1; - - /* Ditto. 
*/ - lmb.reserved.region[0].base = 0; - lmb.reserved.region[0].size = 0; - lmb.reserved.cnt = 1; -} - -void __init lmb_analyze(void) -{ - int i; - - lmb.memory.size = 0; - - for (i = 0; i < lmb.memory.cnt; i++) - lmb.memory.size += lmb.memory.region[i].size; -} - -static long lmb_add_region(struct lmb_region *rgn, u64 base, u64 size) -{ - unsigned long coalesced = 0; - long adjacent, i; - - if ((rgn->cnt == 1) && (rgn->region[0].size == 0)) { - rgn->region[0].base = base; - rgn->region[0].size = size; - return 0; - } - - /* First try and coalesce this LMB with another. */ - for (i = 0; i < rgn->cnt; i++) { - u64 rgnbase = rgn->region[i].base; - u64 rgnsize = rgn->region[i].size; - - if ((rgnbase == base) && (rgnsize == size)) - /* Already have this region, so we're done */ - return 0; - - adjacent = lmb_addrs_adjacent(base, size, rgnbase, rgnsize); - if (adjacent > 0) { - rgn->region[i].base -= size; - rgn->region[i].size += size; - coalesced++; - break; - } else if (adjacent < 0) { - rgn->region[i].size += size; - coalesced++; - break; - } - } - - if ((i < rgn->cnt - 1) && lmb_regions_adjacent(rgn, i, i+1)) { - lmb_coalesce_regions(rgn, i, i+1); - coalesced++; - } - - if (coalesced) - return coalesced; - if (rgn->cnt >= MAX_LMB_REGIONS) - return -1; - - /* Couldn't coalesce the LMB, so add it to the sorted table. */ - for (i = rgn->cnt - 1; i >= 0; i--) { - if (base < rgn->region[i].base) { - rgn->region[i+1].base = rgn->region[i].base; - rgn->region[i+1].size = rgn->region[i].size; - } else { - rgn->region[i+1].base = base; - rgn->region[i+1].size = size; - break; - } - } - - if (base < rgn->region[0].base) { - rgn->region[0].base = base; - rgn->region[0].size = size; - } - rgn->cnt++; - - return 0; -} - -long lmb_add(u64 base, u64 size) -{ - struct lmb_region *_rgn = &lmb.memory; - - /* On pSeries LPAR systems, the first LMB is our RMO region. */ - if (base == 0) - lmb.rmo_size = size; - - return lmb_add_region(_rgn, base, size); - -} - -long lmb_remove(u64 base, u64 size) -{ - struct lmb_region *rgn = &(lmb.memory); - u64 rgnbegin, rgnend; - u64 end = base + size; - int i; - - rgnbegin = rgnend = 0; /* supress gcc warnings */ - - /* Find the region where (base, size) belongs to */ - for (i=0; i < rgn->cnt; i++) { - rgnbegin = rgn->region[i].base; - rgnend = rgnbegin + rgn->region[i].size; - - if ((rgnbegin <= base) && (end <= rgnend)) - break; - } - - /* Didn't find the region */ - if (i == rgn->cnt) - return -1; - - /* Check to see if we are removing entire region */ - if ((rgnbegin == base) && (rgnend == end)) { - lmb_remove_region(rgn, i); - return 0; - } - - /* Check to see if region is matching at the front */ - if (rgnbegin == base) { - rgn->region[i].base = end; - rgn->region[i].size -= size; - return 0; - } - - /* Check to see if the region is matching at the end */ - if (rgnend == end) { - rgn->region[i].size -= size; - return 0; - } - - /* - * We need to split the entry - adjust the current one to the - * beginging of the hole and add the region after hole. 
- */ - rgn->region[i].size = base - rgn->region[i].base; - return lmb_add_region(rgn, end, rgnend - end); -} - -long __init lmb_reserve(u64 base, u64 size) -{ - struct lmb_region *_rgn = &lmb.reserved; - - BUG_ON(0 == size); - - return lmb_add_region(_rgn, base, size); -} - -long lmb_overlaps_region(struct lmb_region *rgn, u64 base, u64 size) -{ - unsigned long i; - - for (i = 0; i < rgn->cnt; i++) { - u64 rgnbase = rgn->region[i].base; - u64 rgnsize = rgn->region[i].size; - if (lmb_addrs_overlap(base, size, rgnbase, rgnsize)) - break; - } - - return (i < rgn->cnt) ? i : -1; -} - -static u64 lmb_align_down(u64 addr, u64 size) -{ - return addr & ~(size - 1); -} - -static u64 lmb_align_up(u64 addr, u64 size) -{ - return (addr + (size - 1)) & ~(size - 1); -} - -static u64 __init lmb_alloc_nid_unreserved(u64 start, u64 end, - u64 size, u64 align) -{ - u64 base, res_base; - long j; - - base = lmb_align_down((end - size), align); - while (start <= base) { - j = lmb_overlaps_region(&lmb.reserved, base, size); - if (j < 0) { - /* this area isn't reserved, take it */ - if (lmb_add_region(&lmb.reserved, base, size) < 0) - base = ~(u64)0; - return base; - } - res_base = lmb.reserved.region[j].base; - if (res_base < size) - break; - base = lmb_align_down(res_base - size, align); - } - - return ~(u64)0; -} - -static u64 __init lmb_alloc_nid_region(struct lmb_property *mp, - u64 (*nid_range)(u64, u64, int *), - u64 size, u64 align, int nid) -{ - u64 start, end; - - start = mp->base; - end = start + mp->size; - - start = lmb_align_up(start, align); - while (start < end) { - u64 this_end; - int this_nid; - - this_end = nid_range(start, end, &this_nid); - if (this_nid == nid) { - u64 ret = lmb_alloc_nid_unreserved(start, this_end, - size, align); - if (ret != ~(u64)0) - return ret; - } - start = this_end; - } - - return ~(u64)0; -} - -u64 __init lmb_alloc_nid(u64 size, u64 align, int nid, - u64 (*nid_range)(u64 start, u64 end, int *nid)) -{ - struct lmb_region *mem = &lmb.memory; - int i; - - BUG_ON(0 == size); - - size = lmb_align_up(size, align); - - for (i = 0; i < mem->cnt; i++) { - u64 ret = lmb_alloc_nid_region(&mem->region[i], - nid_range, - size, align, nid); - if (ret != ~(u64)0) - return ret; - } - - return lmb_alloc(size, align); -} - -u64 __init lmb_alloc(u64 size, u64 align) -{ - return lmb_alloc_base(size, align, LMB_ALLOC_ANYWHERE); -} - -u64 __init lmb_alloc_base(u64 size, u64 align, u64 max_addr) -{ - u64 alloc; - - alloc = __lmb_alloc_base(size, align, max_addr); - - if (alloc == 0) - panic("ERROR: Failed to allocate 0x%llx bytes below 0x%llx.\n", - (unsigned long long) size, (unsigned long long) max_addr); - - return alloc; -} - -u64 __init __lmb_alloc_base(u64 size, u64 align, u64 max_addr) -{ - long i, j; - u64 base = 0; - u64 res_base; - - BUG_ON(0 == size); - - size = lmb_align_up(size, align); - - /* On some platforms, make sure we allocate lowmem */ - /* Note that LMB_REAL_LIMIT may be LMB_ALLOC_ANYWHERE */ - if (max_addr == LMB_ALLOC_ANYWHERE) - max_addr = LMB_REAL_LIMIT; - - for (i = lmb.memory.cnt - 1; i >= 0; i--) { - u64 lmbbase = lmb.memory.region[i].base; - u64 lmbsize = lmb.memory.region[i].size; - - if (lmbsize < size) - continue; - if (max_addr == LMB_ALLOC_ANYWHERE) - base = lmb_align_down(lmbbase + lmbsize - size, align); - else if (lmbbase < max_addr) { - base = min(lmbbase + lmbsize, max_addr); - base = lmb_align_down(base - size, align); - } else - continue; - - while (base && lmbbase <= base) { - j = lmb_overlaps_region(&lmb.reserved, base, size); - if (j < 0) { - 
/* this area isn't reserved, take it */ - if (lmb_add_region(&lmb.reserved, base, size) < 0) - return 0; - return base; - } - res_base = lmb.reserved.region[j].base; - if (res_base < size) - break; - base = lmb_align_down(res_base - size, align); - } - } - return 0; -} - -/* You must call lmb_analyze() before this. */ -u64 __init lmb_phys_mem_size(void) -{ - return lmb.memory.size; -} - -u64 lmb_end_of_DRAM(void) -{ - int idx = lmb.memory.cnt - 1; - - return (lmb.memory.region[idx].base + lmb.memory.region[idx].size); -} - -/* You must call lmb_analyze() after this. */ -void __init lmb_enforce_memory_limit(u64 memory_limit) -{ - unsigned long i; - u64 limit; - struct lmb_property *p; - - if (!memory_limit) - return; - - /* Truncate the lmb regions to satisfy the memory limit. */ - limit = memory_limit; - for (i = 0; i < lmb.memory.cnt; i++) { - if (limit > lmb.memory.region[i].size) { - limit -= lmb.memory.region[i].size; - continue; - } - - lmb.memory.region[i].size = limit; - lmb.memory.cnt = i + 1; - break; - } - - if (lmb.memory.region[0].size < lmb.rmo_size) - lmb.rmo_size = lmb.memory.region[0].size; - - memory_limit = lmb_end_of_DRAM(); - - /* And truncate any reserves above the limit also. */ - for (i = 0; i < lmb.reserved.cnt; i++) { - p = &lmb.reserved.region[i]; - - if (p->base > memory_limit) - p->size = 0; - else if ((p->base + p->size) > memory_limit) - p->size = memory_limit - p->base; - - if (p->size == 0) { - lmb_remove_region(&lmb.reserved, i); - i--; - } - } -} - -int __init lmb_is_reserved(u64 addr) -{ - int i; - - for (i = 0; i < lmb.reserved.cnt; i++) { - u64 upper = lmb.reserved.region[i].base + - lmb.reserved.region[i].size - 1; - if ((addr >= lmb.reserved.region[i].base) && (addr <= upper)) - return 1; - } - return 0; -} - -int lmb_is_region_reserved(u64 base, u64 size) -{ - return lmb_overlaps_region(&lmb.reserved, base, size); -} - -/* - * Given a <base, len>, find which memory regions belong to this range. - * Adjust the request and return a contiguous chunk. - */ -int lmb_find(struct lmb_property *res) -{ - int i; - u64 rstart, rend; - - rstart = res->base; - rend = rstart + res->size - 1; - - for (i = 0; i < lmb.memory.cnt; i++) { - u64 start = lmb.memory.region[i].base; - u64 end = start + lmb.memory.region[i].size - 1; - - if (start > rend) - return -1; - - if ((end >= rstart) && (start < rend)) { - /* adjust the request */ - if (rstart < start) - rstart = start; - if (rend > end) - rend = end; - res->base = rstart; - res->size = rend - rstart + 1; - return 0; - } - } - return -1; -} diff --git a/lib/nlattr.c b/lib/nlattr.c index c4706eb98d3d..5021cbc34411 100644 --- a/lib/nlattr.c +++ b/lib/nlattr.c @@ -15,7 +15,7 @@ #include <linux/types.h> #include <net/netlink.h> -static u16 nla_attr_minlen[NLA_TYPE_MAX+1] __read_mostly = { +static const u16 nla_attr_minlen[NLA_TYPE_MAX+1] = { [NLA_U8] = sizeof(u8), [NLA_U16] = sizeof(u16), [NLA_U32] = sizeof(u32), @@ -23,7 +23,7 @@ static u16 nla_attr_minlen[NLA_TYPE_MAX+1] __read_mostly = { [NLA_NESTED] = NLA_HDRLEN, }; -static int validate_nla(struct nlattr *nla, int maxtype, +static int validate_nla(const struct nlattr *nla, int maxtype, const struct nla_policy *policy) { const struct nla_policy *pt; @@ -115,10 +115,10 @@ static int validate_nla(struct nlattr *nla, int maxtype, * * Returns 0 on success or a negative error code. 
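A hypothetical attribute policy and validation helper using nla_validate(), whose return convention is documented above; the attribute names and numbering are illustrative, not part of the patch:

#include <net/netlink.h>

enum { MY_ATTR_UNSPEC, MY_ATTR_PORT, MY_ATTR_NAME, __MY_ATTR_MAX };
#define MY_ATTR_MAX (__MY_ATTR_MAX - 1)

static const struct nla_policy my_policy[MY_ATTR_MAX + 1] = {
	[MY_ATTR_PORT] = { .type = NLA_U16 },
	[MY_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = 15 },
};

static int my_check_attrs(const struct nlattr *head, int len)
{
	return nla_validate(head, len, MY_ATTR_MAX, my_policy);
}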
*/ -int nla_validate(struct nlattr *head, int len, int maxtype, +int nla_validate(const struct nlattr *head, int len, int maxtype, const struct nla_policy *policy) { - struct nlattr *nla; + const struct nlattr *nla; int rem, err; nla_for_each_attr(nla, head, len, rem) { @@ -167,16 +167,16 @@ nla_policy_len(const struct nla_policy *p, int n) * @policy: validation policy * * Parses a stream of attributes and stores a pointer to each attribute in - * the tb array accessable via the attribute type. Attributes with a type + * the tb array accessible via the attribute type. Attributes with a type * exceeding maxtype will be silently ignored for backwards compatibility * reasons. policy may be set to NULL if no validation is required. * * Returns 0 on success or a negative error code. */ -int nla_parse(struct nlattr *tb[], int maxtype, struct nlattr *head, int len, - const struct nla_policy *policy) +int nla_parse(struct nlattr **tb, int maxtype, const struct nlattr *head, + int len, const struct nla_policy *policy) { - struct nlattr *nla; + const struct nlattr *nla; int rem, err; memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1)); @@ -191,7 +191,7 @@ int nla_parse(struct nlattr *tb[], int maxtype, struct nlattr *head, int len, goto errout; } - tb[type] = nla; + tb[type] = (struct nlattr *)nla; } } @@ -212,14 +212,14 @@ errout: * * Returns the first attribute in the stream matching the specified type. */ -struct nlattr *nla_find(struct nlattr *head, int len, int attrtype) +struct nlattr *nla_find(const struct nlattr *head, int len, int attrtype) { - struct nlattr *nla; + const struct nlattr *nla; int rem; nla_for_each_attr(nla, head, len, rem) if (nla_type(nla) == attrtype) - return nla; + return (struct nlattr *)nla; return NULL; } diff --git a/lib/parser.c b/lib/parser.c index fb34977246bb..6e89eca5cca0 100644 --- a/lib/parser.c +++ b/lib/parser.c @@ -128,12 +128,13 @@ static int match_number(substring_t *s, int *result, int base) char *endp; char *buf; int ret; + size_t len = s->to - s->from; - buf = kmalloc(s->to - s->from + 1, GFP_KERNEL); + buf = kmalloc(len + 1, GFP_KERNEL); if (!buf) return -ENOMEM; - memcpy(buf, s->from, s->to - s->from); - buf[s->to - s->from] = '\0'; + memcpy(buf, s->from, len); + buf[len] = '\0'; *result = simple_strtol(buf, &endp, base); ret = 0; if (endp == buf) diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c index aeaa6d734447..28f2c33c6b53 100644 --- a/lib/percpu_counter.c +++ b/lib/percpu_counter.c @@ -8,10 +8,53 @@ #include <linux/init.h> #include <linux/cpu.h> #include <linux/module.h> +#include <linux/debugobjects.h> static LIST_HEAD(percpu_counters); static DEFINE_MUTEX(percpu_counters_lock); +#ifdef CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER + +static struct debug_obj_descr percpu_counter_debug_descr; + +static int percpu_counter_fixup_free(void *addr, enum debug_obj_state state) +{ + struct percpu_counter *fbc = addr; + + switch (state) { + case ODEBUG_STATE_ACTIVE: + percpu_counter_destroy(fbc); + debug_object_free(fbc, &percpu_counter_debug_descr); + return 1; + default: + return 0; + } +} + +static struct debug_obj_descr percpu_counter_debug_descr = { + .name = "percpu_counter", + .fixup_free = percpu_counter_fixup_free, +}; + +static inline void debug_percpu_counter_activate(struct percpu_counter *fbc) +{ + debug_object_init(fbc, &percpu_counter_debug_descr); + debug_object_activate(fbc, &percpu_counter_debug_descr); +} + +static inline void debug_percpu_counter_deactivate(struct percpu_counter *fbc) +{ + debug_object_deactivate(fbc, 
&percpu_counter_debug_descr); + debug_object_free(fbc, &percpu_counter_debug_descr); +} + +#else /* CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER */ +static inline void debug_percpu_counter_activate(struct percpu_counter *fbc) +{ } +static inline void debug_percpu_counter_deactivate(struct percpu_counter *fbc) +{ } +#endif /* CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER */ + void percpu_counter_set(struct percpu_counter *fbc, s64 amount) { int cpu; @@ -29,20 +72,18 @@ EXPORT_SYMBOL(percpu_counter_set); void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch) { s64 count; - s32 *pcount; - int cpu = get_cpu(); - pcount = per_cpu_ptr(fbc->counters, cpu); - count = *pcount + amount; + preempt_disable(); + count = __this_cpu_read(*fbc->counters) + amount; if (count >= batch || count <= -batch) { spin_lock(&fbc->lock); fbc->count += count; - *pcount = 0; + __this_cpu_write(*fbc->counters, 0); spin_unlock(&fbc->lock); } else { - *pcount = count; + __this_cpu_write(*fbc->counters, count); } - put_cpu(); + preempt_enable(); } EXPORT_SYMBOL(__percpu_counter_add); @@ -75,7 +116,11 @@ int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, fbc->counters = alloc_percpu(s32); if (!fbc->counters) return -ENOMEM; + + debug_percpu_counter_activate(fbc); + #ifdef CONFIG_HOTPLUG_CPU + INIT_LIST_HEAD(&fbc->list); mutex_lock(&percpu_counters_lock); list_add(&fbc->list, &percpu_counters); mutex_unlock(&percpu_counters_lock); @@ -89,6 +134,8 @@ void percpu_counter_destroy(struct percpu_counter *fbc) if (!fbc->counters) return; + debug_percpu_counter_deactivate(fbc); + #ifdef CONFIG_HOTPLUG_CPU mutex_lock(&percpu_counters_lock); list_del(&fbc->list); @@ -137,6 +184,33 @@ static int __cpuinit percpu_counter_hotcpu_callback(struct notifier_block *nb, return NOTIFY_OK; } +/* + * Compare counter against given value. 
+ * Return 1 if greater, 0 if equal and -1 if less + */ +int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs) +{ + s64 count; + + count = percpu_counter_read(fbc); + /* Check to see if rough count will be sufficient for comparison */ + if (abs(count - rhs) > (percpu_counter_batch*num_online_cpus())) { + if (count > rhs) + return 1; + else + return -1; + } + /* Need to use precise count */ + count = percpu_counter_sum(fbc); + if (count > rhs) + return 1; + else if (count < rhs) + return -1; + else + return 0; +} +EXPORT_SYMBOL(percpu_counter_compare); + static int __init percpu_counter_startup(void) { compute_batch_value(); diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 92cdd9936e3d..5086bb962b4d 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -28,7 +28,6 @@ #include <linux/slab.h> #include <linux/notifier.h> #include <linux/cpu.h> -#include <linux/gfp.h> #include <linux/string.h> #include <linux/bitops.h> #include <linux/rcupdate.h> @@ -50,7 +49,7 @@ struct radix_tree_node { unsigned int height; /* Height from the bottom */ unsigned int count; struct rcu_head rcu_head; - void *slots[RADIX_TREE_MAP_SIZE]; + void __rcu *slots[RADIX_TREE_MAP_SIZE]; unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS]; }; @@ -83,6 +82,16 @@ struct radix_tree_preload { }; static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, }; +static inline void *ptr_to_indirect(void *ptr) +{ + return (void *)((unsigned long)ptr | RADIX_TREE_INDIRECT_PTR); +} + +static inline void *indirect_to_ptr(void *ptr) +{ + return (void *)((unsigned long)ptr & ~RADIX_TREE_INDIRECT_PTR); +} + static inline gfp_t root_gfp_mask(struct radix_tree_root *root) { return root->gfp_mask & __GFP_BITS_MASK; @@ -175,14 +184,16 @@ static void radix_tree_node_rcu_free(struct rcu_head *head) { struct radix_tree_node *node = container_of(head, struct radix_tree_node, rcu_head); + int i; /* * must only free zeroed nodes into the slab. radix_tree_shrink * can leave us with a non-NULL entry in the first slot, so clear * that here to make sure. */ - tag_clear(node, 0, 0); - tag_clear(node, 1, 0); + for (i = 0; i < RADIX_TREE_MAX_TAGS; i++) + tag_clear(node, i, 0); + node->slots[0] = NULL; node->count = 0; @@ -264,7 +275,7 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index) return -ENOMEM; /* Increase the height. 
*/ - node->slots[0] = radix_tree_indirect_to_ptr(root->rnode); + node->slots[0] = indirect_to_ptr(root->rnode); /* Propagate the aggregated tag info into the new root */ for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) { @@ -275,7 +286,7 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index) newheight = root->height+1; node->height = newheight; node->count = 1; - node = radix_tree_ptr_to_indirect(node); + node = ptr_to_indirect(node); rcu_assign_pointer(root->rnode, node); root->height = newheight; } while (height > root->height); @@ -308,7 +319,7 @@ int radix_tree_insert(struct radix_tree_root *root, return error; } - slot = radix_tree_indirect_to_ptr(root->rnode); + slot = indirect_to_ptr(root->rnode); height = root->height; shift = (height-1) * RADIX_TREE_MAP_SHIFT; @@ -324,8 +335,7 @@ int radix_tree_insert(struct radix_tree_root *root, rcu_assign_pointer(node->slots[offset], slot); node->count++; } else - rcu_assign_pointer(root->rnode, - radix_tree_ptr_to_indirect(slot)); + rcu_assign_pointer(root->rnode, ptr_to_indirect(slot)); } /* Go a level down */ @@ -364,7 +374,7 @@ static void *radix_tree_lookup_element(struct radix_tree_root *root, unsigned int height, shift; struct radix_tree_node *node, **slot; - node = rcu_dereference(root->rnode); + node = rcu_dereference_raw(root->rnode); if (node == NULL) return NULL; @@ -373,7 +383,7 @@ static void *radix_tree_lookup_element(struct radix_tree_root *root, return NULL; return is_slot ? (void *)&root->rnode : node; } - node = radix_tree_indirect_to_ptr(node); + node = indirect_to_ptr(node); height = node->height; if (index > radix_tree_maxindex(height)) @@ -384,7 +394,7 @@ static void *radix_tree_lookup_element(struct radix_tree_root *root, do { slot = (struct radix_tree_node **) (node->slots + ((index>>shift) & RADIX_TREE_MAP_MASK)); - node = rcu_dereference(*slot); + node = rcu_dereference_raw(*slot); if (node == NULL) return NULL; @@ -392,7 +402,7 @@ static void *radix_tree_lookup_element(struct radix_tree_root *root, height--; } while (height > 0); - return is_slot ? (void *)slot:node; + return is_slot ? (void *)slot : indirect_to_ptr(node); } /** @@ -454,7 +464,7 @@ void *radix_tree_tag_set(struct radix_tree_root *root, height = root->height; BUG_ON(index > radix_tree_maxindex(height)); - slot = radix_tree_indirect_to_ptr(root->rnode); + slot = indirect_to_ptr(root->rnode); shift = (height - 1) * RADIX_TREE_MAP_SHIFT; while (height > 0) { @@ -508,7 +518,7 @@ void *radix_tree_tag_clear(struct radix_tree_root *root, shift = (height - 1) * RADIX_TREE_MAP_SHIFT; pathp->node = NULL; - slot = radix_tree_indirect_to_ptr(root->rnode); + slot = indirect_to_ptr(root->rnode); while (height > 0) { int offset; @@ -556,6 +566,10 @@ EXPORT_SYMBOL(radix_tree_tag_clear); * * 0: tag not present or not set * 1: tag set + * + * Note that the return value of this function may not be relied on, even if + * the RCU lock is held, unless tag modification and node deletion are excluded + * from concurrency. 
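A hypothetical insert-and-tag sequence using the radix tree calls touched in this hunk; my_tree and the helpers are illustrative, callers provide their own write-side locking, and this snippet is not part of the patch:

#include <linux/radix-tree.h>

static RADIX_TREE(my_tree, GFP_KERNEL);

static int my_store(unsigned long index, void *item)
{
	int err;

	err = radix_tree_insert(&my_tree, index, item);
	if (err)
		return err;			/* -EEXIST or -ENOMEM */

	radix_tree_tag_set(&my_tree, index, 0);	/* mark the new entry with tag 0 */
	return 0;
}

static void *my_find(unsigned long index)
{
	return radix_tree_lookup(&my_tree, index);	/* may run under rcu_read_lock() */
}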
*/ int radix_tree_tag_get(struct radix_tree_root *root, unsigned long index, unsigned int tag) @@ -568,13 +582,13 @@ int radix_tree_tag_get(struct radix_tree_root *root, if (!root_tag_get(root, tag)) return 0; - node = rcu_dereference(root->rnode); + node = rcu_dereference_raw(root->rnode); if (node == NULL) return 0; if (!radix_tree_is_indirect_ptr(node)) return (index == 0); - node = radix_tree_indirect_to_ptr(node); + node = indirect_to_ptr(node); height = node->height; if (index > radix_tree_maxindex(height)) @@ -596,13 +610,9 @@ int radix_tree_tag_get(struct radix_tree_root *root, */ if (!tag_get(node, tag, offset)) saw_unset_tag = 1; - if (height == 1) { - int ret = tag_get(node, tag, offset); - - BUG_ON(ret && saw_unset_tag); - return !!ret; - } - node = rcu_dereference(node->slots[offset]); + if (height == 1) + return !!tag_get(node, tag, offset); + node = rcu_dereference_raw(node->slots[offset]); shift -= RADIX_TREE_MAP_SHIFT; height--; } @@ -610,6 +620,134 @@ int radix_tree_tag_get(struct radix_tree_root *root, EXPORT_SYMBOL(radix_tree_tag_get); /** + * radix_tree_range_tag_if_tagged - for each item in given range set given + * tag if item has another tag set + * @root: radix tree root + * @first_indexp: pointer to a starting index of a range to scan + * @last_index: last index of a range to scan + * @nr_to_tag: maximum number items to tag + * @iftag: tag index to test + * @settag: tag index to set if tested tag is set + * + * This function scans range of radix tree from first_index to last_index + * (inclusive). For each item in the range if iftag is set, the function sets + * also settag. The function stops either after tagging nr_to_tag items or + * after reaching last_index. + * + * The tags must be set from the leaf level only and propagated back up the + * path to the root. We must do this so that we resolve the full path before + * setting any tags on intermediate nodes. If we set tags as we descend, then + * we can get to the leaf node and find that the index that has the iftag + * set is outside the range we are scanning. This reults in dangling tags and + * can lead to problems with later tag operations (e.g. livelocks on lookups). + * + * The function returns number of leaves where the tag was set and sets + * *first_indexp to the first unscanned index. + * WARNING! *first_indexp can wrap if last_index is ULONG_MAX. Caller must + * be prepared to handle that. + */ +unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root, + unsigned long *first_indexp, unsigned long last_index, + unsigned long nr_to_tag, + unsigned int iftag, unsigned int settag) +{ + unsigned int height = root->height; + struct radix_tree_path path[height]; + struct radix_tree_path *pathp = path; + struct radix_tree_node *slot; + unsigned int shift; + unsigned long tagged = 0; + unsigned long index = *first_indexp; + + last_index = min(last_index, radix_tree_maxindex(height)); + if (index > last_index) + return 0; + if (!nr_to_tag) + return 0; + if (!root_tag_get(root, iftag)) { + *first_indexp = last_index + 1; + return 0; + } + if (height == 0) { + *first_indexp = last_index + 1; + root_tag_set(root, settag); + return 1; + } + + shift = (height - 1) * RADIX_TREE_MAP_SHIFT; + slot = indirect_to_ptr(root->rnode); + + /* + * we fill the path from (root->height - 2) to 0, leaving the index at + * (root->height - 1) as a terminator. Zero the node in the terminator + * so that we can use this to end walk loops back up the path. 
+ */ + path[height - 1].node = NULL; + + for (;;) { + int offset; + + offset = (index >> shift) & RADIX_TREE_MAP_MASK; + if (!slot->slots[offset]) + goto next; + if (!tag_get(slot, iftag, offset)) + goto next; + if (height > 1) { + /* Go down one level */ + height--; + shift -= RADIX_TREE_MAP_SHIFT; + path[height - 1].node = slot; + path[height - 1].offset = offset; + slot = slot->slots[offset]; + continue; + } + + /* tag the leaf */ + tagged++; + tag_set(slot, settag, offset); + + /* walk back up the path tagging interior nodes */ + pathp = &path[0]; + while (pathp->node) { + /* stop if we find a node with the tag already set */ + if (tag_get(pathp->node, settag, pathp->offset)) + break; + tag_set(pathp->node, settag, pathp->offset); + pathp++; + } + +next: + /* Go to next item at level determined by 'shift' */ + index = ((index >> shift) + 1) << shift; + /* Overflow can happen when last_index is ~0UL... */ + if (index > last_index || !index) + break; + if (tagged >= nr_to_tag) + break; + while (((index >> shift) & RADIX_TREE_MAP_MASK) == 0) { + /* + * We've fully scanned this node. Go up. Because + * last_index is guaranteed to be in the tree, what + * we do below cannot wander astray. + */ + slot = path[height - 1].node; + height++; + shift += RADIX_TREE_MAP_SHIFT; + } + } + /* + * The iftag must have been set somewhere because otherwise + * we would return immediated at the beginning of the function + */ + root_tag_set(root, settag); + *first_indexp = index; + + return tagged; +} +EXPORT_SYMBOL(radix_tree_range_tag_if_tagged); + + +/** * radix_tree_next_hole - find the next hole (not-present entry) * @root: tree root * @index: index key @@ -657,7 +795,7 @@ EXPORT_SYMBOL(radix_tree_next_hole); * * Returns: the index of the hole if found, otherwise returns an index * outside of the set specified (in which case 'index - return >= max_scan' - * will be true). In rare cases of wrap-around, LONG_MAX will be returned. + * will be true). In rare cases of wrap-around, ULONG_MAX will be returned. * * radix_tree_next_hole may be called under rcu_read_lock. 
However, like * radix_tree_gang_lookup, this will not atomically search a snapshot of @@ -675,7 +813,7 @@ unsigned long radix_tree_prev_hole(struct radix_tree_root *root, if (!radix_tree_lookup(root, index)) break; index--; - if (index == LONG_MAX) + if (index == ULONG_MAX) break; } @@ -711,7 +849,7 @@ __lookup(struct radix_tree_node *slot, void ***results, unsigned long index, } shift -= RADIX_TREE_MAP_SHIFT; - slot = rcu_dereference(slot->slots[i]); + slot = rcu_dereference_raw(slot->slots[i]); if (slot == NULL) goto out; } @@ -758,7 +896,7 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results, unsigned long cur_index = first_index; unsigned int ret; - node = rcu_dereference(root->rnode); + node = rcu_dereference_raw(root->rnode); if (!node) return 0; @@ -768,7 +906,7 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results, results[0] = node; return 1; } - node = radix_tree_indirect_to_ptr(node); + node = indirect_to_ptr(node); max_index = radix_tree_maxindex(node->height); @@ -787,7 +925,8 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results, slot = *(((void ***)results)[ret + i]); if (!slot) continue; - results[ret + nr_found] = rcu_dereference(slot); + results[ret + nr_found] = + indirect_to_ptr(rcu_dereference_raw(slot)); nr_found++; } ret += nr_found; @@ -826,7 +965,7 @@ radix_tree_gang_lookup_slot(struct radix_tree_root *root, void ***results, unsigned long cur_index = first_index; unsigned int ret; - node = rcu_dereference(root->rnode); + node = rcu_dereference_raw(root->rnode); if (!node) return 0; @@ -836,7 +975,7 @@ radix_tree_gang_lookup_slot(struct radix_tree_root *root, void ***results, results[0] = (void **)&root->rnode; return 1; } - node = radix_tree_indirect_to_ptr(node); + node = indirect_to_ptr(node); max_index = radix_tree_maxindex(node->height); @@ -915,7 +1054,7 @@ __lookup_tag(struct radix_tree_node *slot, void ***results, unsigned long index, } } shift -= RADIX_TREE_MAP_SHIFT; - slot = rcu_dereference(slot->slots[i]); + slot = rcu_dereference_raw(slot->slots[i]); if (slot == NULL) break; } @@ -951,7 +1090,7 @@ radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results, if (!root_tag_get(root, tag)) return 0; - node = rcu_dereference(root->rnode); + node = rcu_dereference_raw(root->rnode); if (!node) return 0; @@ -961,7 +1100,7 @@ radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results, results[0] = node; return 1; } - node = radix_tree_indirect_to_ptr(node); + node = indirect_to_ptr(node); max_index = radix_tree_maxindex(node->height); @@ -980,7 +1119,8 @@ radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results, slot = *(((void ***)results)[ret + i]); if (!slot) continue; - results[ret + nr_found] = rcu_dereference(slot); + results[ret + nr_found] = + indirect_to_ptr(rcu_dereference_raw(slot)); nr_found++; } ret += nr_found; @@ -1020,7 +1160,7 @@ radix_tree_gang_lookup_tag_slot(struct radix_tree_root *root, void ***results, if (!root_tag_get(root, tag)) return 0; - node = rcu_dereference(root->rnode); + node = rcu_dereference_raw(root->rnode); if (!node) return 0; @@ -1030,7 +1170,7 @@ radix_tree_gang_lookup_tag_slot(struct radix_tree_root *root, void ***results, results[0] = (void **)&root->rnode; return 1; } - node = radix_tree_indirect_to_ptr(node); + node = indirect_to_ptr(node); max_index = radix_tree_maxindex(node->height); @@ -1066,7 +1206,7 @@ static inline void radix_tree_shrink(struct radix_tree_root *root) void *newptr; 
BUG_ON(!radix_tree_is_indirect_ptr(to_free)); - to_free = radix_tree_indirect_to_ptr(to_free); + to_free = indirect_to_ptr(to_free); /* * The candidate node has more than one child, or its child @@ -1079,16 +1219,39 @@ static inline void radix_tree_shrink(struct radix_tree_root *root) /* * We don't need rcu_assign_pointer(), since we are simply - * moving the node from one part of the tree to another. If - * it was safe to dereference the old pointer to it + * moving the node from one part of the tree to another: if it + * was safe to dereference the old pointer to it * (to_free->slots[0]), it will be safe to dereference the new - * one (root->rnode). + * one (root->rnode) as far as dependent read barriers go. */ newptr = to_free->slots[0]; if (root->height > 1) - newptr = radix_tree_ptr_to_indirect(newptr); + newptr = ptr_to_indirect(newptr); root->rnode = newptr; root->height--; + + /* + * We have a dilemma here. The node's slot[0] must not be + * NULLed in case there are concurrent lookups expecting to + * find the item. However if this was a bottom-level node, + * then it may be subject to the slot pointer being visible + * to callers dereferencing it. If item corresponding to + * slot[0] is subsequently deleted, these callers would expect + * their slot to become empty sooner or later. + * + * For example, lockless pagecache will look up a slot, deref + * the page pointer, and if the page is 0 refcount it means it + * was concurrently deleted from pagecache so try the deref + * again. Fortunately there is already a requirement for logic + * to retry the entire slot lookup -- the indirect pointer + * problem (replacing direct root node with an indirect pointer + * also results in a stale slot). So tag the slot as indirect + * to force callers to retry. 
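The caller-side retry this relies on is roughly the lockless pagecache lookup pattern, paraphrased here as a sketch (the mapping, offset and page variables are assumed context; radix_tree_deref_retry() sees the indirect bit left on the stale slot and forces another pass):

	struct page *page;
	void **pagep;

	rcu_read_lock();
repeat:
	pagep = radix_tree_lookup_slot(&mapping->page_tree, offset);
	page = pagep ? radix_tree_deref_slot(pagep) : NULL;
	if (page) {
		if (radix_tree_deref_retry(page))
			goto repeat;		/* stale slot, look the page up again */
		if (!page_cache_get_speculative(page))
			goto repeat;
		if (unlikely(page != *pagep)) {	/* page moved while we took the ref? */
			page_cache_release(page);
			goto repeat;
		}
	}
	rcu_read_unlock();
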
+ */ + if (root->height == 0) + *((unsigned long *)&to_free->slots[0]) |= + RADIX_TREE_INDIRECT_PTR; + radix_tree_node_free(to_free); } } @@ -1125,7 +1288,7 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) root->rnode = NULL; goto out; } - slot = radix_tree_indirect_to_ptr(slot); + slot = indirect_to_ptr(slot); shift = (height - 1) * RADIX_TREE_MAP_SHIFT; pathp->node = NULL; @@ -1167,8 +1330,7 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) radix_tree_node_free(to_free); if (pathp->node->count) { - if (pathp->node == - radix_tree_indirect_to_ptr(root->rnode)) + if (pathp->node == indirect_to_ptr(root->rnode)) radix_tree_shrink(root); goto out; } diff --git a/lib/raid6/.gitignore b/lib/raid6/.gitignore new file mode 100644 index 000000000000..162becacf97c --- /dev/null +++ b/lib/raid6/.gitignore @@ -0,0 +1,4 @@ +mktables +altivec*.c +int*.c +tables.c diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile new file mode 100644 index 000000000000..8a38102770f3 --- /dev/null +++ b/lib/raid6/Makefile @@ -0,0 +1,75 @@ +obj-$(CONFIG_RAID6_PQ) += raid6_pq.o + +raid6_pq-y += algos.o recov.o tables.o int1.o int2.o int4.o \ + int8.o int16.o int32.o altivec1.o altivec2.o altivec4.o \ + altivec8.o mmx.o sse1.o sse2.o +hostprogs-y += mktables + +quiet_cmd_unroll = UNROLL $@ + cmd_unroll = $(AWK) -f$(srctree)/$(src)/unroll.awk -vN=$(UNROLL) \ + < $< > $@ || ( rm -f $@ && exit 1 ) + +ifeq ($(CONFIG_ALTIVEC),y) +altivec_flags := -maltivec -mabi=altivec +endif + +targets += int1.c +$(obj)/int1.c: UNROLL := 1 +$(obj)/int1.c: $(src)/int.uc $(src)/unroll.awk FORCE + $(call if_changed,unroll) + +targets += int2.c +$(obj)/int2.c: UNROLL := 2 +$(obj)/int2.c: $(src)/int.uc $(src)/unroll.awk FORCE + $(call if_changed,unroll) + +targets += int4.c +$(obj)/int4.c: UNROLL := 4 +$(obj)/int4.c: $(src)/int.uc $(src)/unroll.awk FORCE + $(call if_changed,unroll) + +targets += int8.c +$(obj)/int8.c: UNROLL := 8 +$(obj)/int8.c: $(src)/int.uc $(src)/unroll.awk FORCE + $(call if_changed,unroll) + +targets += int16.c +$(obj)/int16.c: UNROLL := 16 +$(obj)/int16.c: $(src)/int.uc $(src)/unroll.awk FORCE + $(call if_changed,unroll) + +targets += int32.c +$(obj)/int32.c: UNROLL := 32 +$(obj)/int32.c: $(src)/int.uc $(src)/unroll.awk FORCE + $(call if_changed,unroll) + +CFLAGS_altivec1.o += $(altivec_flags) +targets += altivec1.c +$(obj)/altivec1.c: UNROLL := 1 +$(obj)/altivec1.c: $(src)/altivec.uc $(src)/unroll.awk FORCE + $(call if_changed,unroll) + +CFLAGS_altivec2.o += $(altivec_flags) +targets += altivec2.c +$(obj)/altivec2.c: UNROLL := 2 +$(obj)/altivec2.c: $(src)/altivec.uc $(src)/unroll.awk FORCE + $(call if_changed,unroll) + +CFLAGS_altivec4.o += $(altivec_flags) +targets += altivec4.c +$(obj)/altivec4.c: UNROLL := 4 +$(obj)/altivec4.c: $(src)/altivec.uc $(src)/unroll.awk FORCE + $(call if_changed,unroll) + +CFLAGS_altivec8.o += $(altivec_flags) +targets += altivec8.c +$(obj)/altivec8.c: UNROLL := 8 +$(obj)/altivec8.c: $(src)/altivec.uc $(src)/unroll.awk FORCE + $(call if_changed,unroll) + +quiet_cmd_mktable = TABLE $@ + cmd_mktable = $(obj)/mktables > $@ || ( rm -f $@ && exit 1 ) + +targets += tables.c +$(obj)/tables.c: $(obj)/mktables FORCE + $(call if_changed,mktable) diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c new file mode 100644 index 000000000000..b595f560bee7 --- /dev/null +++ b/lib/raid6/algos.c @@ -0,0 +1,154 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright 2002 H. 
Peter Anvin - All Rights Reserved + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 53 Temple Place Ste 330, + * Boston MA 02111-1307, USA; either version 2 of the License, or + * (at your option) any later version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * raid6/algos.c + * + * Algorithm list and algorithm selection for RAID-6 + */ + +#include <linux/raid/pq.h> +#ifndef __KERNEL__ +#include <sys/mman.h> +#include <stdio.h> +#else +#include <linux/gfp.h> +#if !RAID6_USE_EMPTY_ZERO_PAGE +/* In .bss so it's zeroed */ +const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256))); +EXPORT_SYMBOL(raid6_empty_zero_page); +#endif +#endif + +struct raid6_calls raid6_call; +EXPORT_SYMBOL_GPL(raid6_call); + +const struct raid6_calls * const raid6_algos[] = { + &raid6_intx1, + &raid6_intx2, + &raid6_intx4, + &raid6_intx8, +#if defined(__ia64__) + &raid6_intx16, + &raid6_intx32, +#endif +#if defined(__i386__) && !defined(__arch_um__) + &raid6_mmxx1, + &raid6_mmxx2, + &raid6_sse1x1, + &raid6_sse1x2, + &raid6_sse2x1, + &raid6_sse2x2, +#endif +#if defined(__x86_64__) && !defined(__arch_um__) + &raid6_sse2x1, + &raid6_sse2x2, + &raid6_sse2x4, +#endif +#ifdef CONFIG_ALTIVEC + &raid6_altivec1, + &raid6_altivec2, + &raid6_altivec4, + &raid6_altivec8, +#endif + NULL +}; + +#ifdef __KERNEL__ +#define RAID6_TIME_JIFFIES_LG2 4 +#else +/* Need more time to be stable in userspace */ +#define RAID6_TIME_JIFFIES_LG2 9 +#define time_before(x, y) ((x) < (y)) +#endif + +/* Try to pick the best algorithm */ +/* This code uses the gfmul table as convenient data set to abuse */ + +int __init raid6_select_algo(void) +{ + const struct raid6_calls * const * algo; + const struct raid6_calls * best; + char *syndromes; + void *dptrs[(65536/PAGE_SIZE)+2]; + int i, disks; + unsigned long perf, bestperf; + int bestprefer; + unsigned long j0, j1; + + disks = (65536/PAGE_SIZE)+2; + for ( i = 0 ; i < disks-2 ; i++ ) { + dptrs[i] = ((char *)raid6_gfmul) + PAGE_SIZE*i; + } + + /* Normal code - use a 2-page allocation to avoid D$ conflict */ + syndromes = (void *) __get_free_pages(GFP_KERNEL, 1); + + if ( !syndromes ) { + printk("raid6: Yikes! No memory available.\n"); + return -ENOMEM; + } + + dptrs[disks-2] = syndromes; + dptrs[disks-1] = syndromes + PAGE_SIZE; + + bestperf = 0; bestprefer = 0; best = NULL; + + for ( algo = raid6_algos ; *algo ; algo++ ) { + if ( !(*algo)->valid || (*algo)->valid() ) { + perf = 0; + + preempt_disable(); + j0 = jiffies; + while ( (j1 = jiffies) == j0 ) + cpu_relax(); + while (time_before(jiffies, + j1 + (1<<RAID6_TIME_JIFFIES_LG2))) { + (*algo)->gen_syndrome(disks, PAGE_SIZE, dptrs); + perf++; + } + preempt_enable(); + + if ( (*algo)->prefer > bestprefer || + ((*algo)->prefer == bestprefer && + perf > bestperf) ) { + best = *algo; + bestprefer = best->prefer; + bestperf = perf; + } + printk("raid6: %-8s %5ld MB/s\n", (*algo)->name, + (perf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2)); + } + } + + if (best) { + printk("raid6: using algorithm %s (%ld MB/s)\n", + best->name, + (bestperf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2)); + raid6_call = *best; + } else + printk("raid6: Yikes! No algorithm found!\n"); + + free_pages((unsigned long)syndromes, 1); + + return best ? 
0 : -EINVAL; +} + +static void raid6_exit(void) +{ + do { } while (0); +} + +subsys_initcall(raid6_select_algo); +module_exit(raid6_exit); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("RAID6 Q-syndrome calculations"); diff --git a/lib/raid6/altivec.uc b/lib/raid6/altivec.uc new file mode 100644 index 000000000000..2654d5c854be --- /dev/null +++ b/lib/raid6/altivec.uc @@ -0,0 +1,130 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright 2002-2004 H. Peter Anvin - All Rights Reserved + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 53 Temple Place Ste 330, + * Boston MA 02111-1307, USA; either version 2 of the License, or + * (at your option) any later version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * raid6altivec$#.c + * + * $#-way unrolled portable integer math RAID-6 instruction set + * + * This file is postprocessed using unroll.awk + * + * <benh> hpa: in process, + * you can just "steal" the vec unit with enable_kernel_altivec() (but + * bracked this with preempt_disable/enable or in a lock) + */ + +#include <linux/raid/pq.h> + +#ifdef CONFIG_ALTIVEC + +#include <altivec.h> +#ifdef __KERNEL__ +# include <asm/system.h> +# include <asm/cputable.h> +#endif + +/* + * This is the C data type to use. We use a vector of + * signed char so vec_cmpgt() will generate the right + * instruction. + */ + +typedef vector signed char unative_t; + +#define NBYTES(x) ((vector signed char) {x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x}) +#define NSIZE sizeof(unative_t) + +/* + * The SHLBYTE() operation shifts each byte left by 1, *not* + * rolling over into the next byte + */ +static inline __attribute_const__ unative_t SHLBYTE(unative_t v) +{ + return vec_add(v,v); +} + +/* + * The MASK() operation returns 0xFF in any byte for which the high + * bit is 1, 0x00 for any byte for which the high bit is 0. 
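What SHLBYTE() and MASK() implement, one vector lane at a time, is multiplication by 2 in GF(2^8) with the 0x11d reduction polynomial. A scalar reference for comparison (a hypothetical helper, not part of this patch) would be:

	static inline u8 gf256_mul2(u8 b)
	{
		/* shift left by one, then reduce if the high bit fell out */
		return (u8)((b << 1) ^ ((b & 0x80) ? 0x1d : 0));
	}

MASK() produces the per-byte (b & 0x80 ? 0xff : 0x00) predicate, which the syndrome loops below AND with 0x1d and XOR into the shifted value, giving exactly this multiply-by-2 across the whole vector.
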
+ */ +static inline __attribute_const__ unative_t MASK(unative_t v) +{ + unative_t zv = NBYTES(0); + + /* vec_cmpgt returns a vector bool char; thus the need for the cast */ + return (unative_t)vec_cmpgt(zv, v); +} + + +/* This is noinline to make damned sure that gcc doesn't move any of the + Altivec code around the enable/disable code */ +static void noinline +raid6_altivec$#_gen_syndrome_real(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + unative_t wd$$, wq$$, wp$$, w1$$, w2$$; + unative_t x1d = NBYTES(0x1d); + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + for ( d = 0 ; d < bytes ; d += NSIZE*$# ) { + wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; + for ( z = z0-1 ; z >= 0 ; z-- ) { + wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; + wp$$ = vec_xor(wp$$, wd$$); + w2$$ = MASK(wq$$); + w1$$ = SHLBYTE(wq$$); + w2$$ = vec_and(w2$$, x1d); + w1$$ = vec_xor(w1$$, w2$$); + wq$$ = vec_xor(w1$$, wd$$); + } + *(unative_t *)&p[d+NSIZE*$$] = wp$$; + *(unative_t *)&q[d+NSIZE*$$] = wq$$; + } +} + +static void raid6_altivec$#_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + preempt_disable(); + enable_kernel_altivec(); + + raid6_altivec$#_gen_syndrome_real(disks, bytes, ptrs); + + preempt_enable(); +} + +int raid6_have_altivec(void); +#if $# == 1 +int raid6_have_altivec(void) +{ + /* This assumes either all CPUs have Altivec or none does */ +# ifdef __KERNEL__ + return cpu_has_feature(CPU_FTR_ALTIVEC); +# else + return 1; +# endif +} +#endif + +const struct raid6_calls raid6_altivec$# = { + raid6_altivec$#_gen_syndrome, + raid6_have_altivec, + "altivecx$#", + 0 +}; + +#endif /* CONFIG_ALTIVEC */ diff --git a/lib/raid6/int.uc b/lib/raid6/int.uc new file mode 100644 index 000000000000..d1e276a14fab --- /dev/null +++ b/lib/raid6/int.uc @@ -0,0 +1,117 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright 2002-2004 H. Peter Anvin - All Rights Reserved + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 53 Temple Place Ste 330, + * Boston MA 02111-1307, USA; either version 2 of the License, or + * (at your option) any later version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * raid6int$#.c + * + * $#-way unrolled portable integer math RAID-6 instruction set + * + * This file is postprocessed using unroll.awk + */ + +#include <linux/raid/pq.h> + +/* + * This is the C data type to use + */ + +/* Change this from BITS_PER_LONG if there is something better... */ +#if BITS_PER_LONG == 64 +# define NBYTES(x) ((x) * 0x0101010101010101UL) +# define NSIZE 8 +# define NSHIFT 3 +# define NSTRING "64" +typedef u64 unative_t; +#else +# define NBYTES(x) ((x) * 0x01010101U) +# define NSIZE 4 +# define NSHIFT 2 +# define NSTRING "32" +typedef u32 unative_t; +#endif + + + +/* + * IA-64 wants insane amounts of unrolling. On other architectures that + * is just a waste of space. + */ +#if ($# <= 8) || defined(__ia64__) + + +/* + * These sub-operations are separate inlines since they can sometimes be + * specially optimized using architecture-specific hacks. 
+ */ + +/* + * The SHLBYTE() operation shifts each byte left by 1, *not* + * rolling over into the next byte + */ +static inline __attribute_const__ unative_t SHLBYTE(unative_t v) +{ + unative_t vv; + + vv = (v << 1) & NBYTES(0xfe); + return vv; +} + +/* + * The MASK() operation returns 0xFF in any byte for which the high + * bit is 1, 0x00 for any byte for which the high bit is 0. + */ +static inline __attribute_const__ unative_t MASK(unative_t v) +{ + unative_t vv; + + vv = v & NBYTES(0x80); + vv = (vv << 1) - (vv >> 7); /* Overflow on the top bit is OK */ + return vv; +} + + +static void raid6_int$#_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + unative_t wd$$, wq$$, wp$$, w1$$, w2$$; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + for ( d = 0 ; d < bytes ; d += NSIZE*$# ) { + wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; + for ( z = z0-1 ; z >= 0 ; z-- ) { + wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; + wp$$ ^= wd$$; + w2$$ = MASK(wq$$); + w1$$ = SHLBYTE(wq$$); + w2$$ &= NBYTES(0x1d); + w1$$ ^= w2$$; + wq$$ = w1$$ ^ wd$$; + } + *(unative_t *)&p[d+NSIZE*$$] = wp$$; + *(unative_t *)&q[d+NSIZE*$$] = wq$$; + } +} + +const struct raid6_calls raid6_intx$# = { + raid6_int$#_gen_syndrome, + NULL, /* always valid */ + "int" NSTRING "x$#", + 0 +}; + +#endif diff --git a/lib/raid6/mktables.c b/lib/raid6/mktables.c new file mode 100644 index 000000000000..3b1500843bba --- /dev/null +++ b/lib/raid6/mktables.c @@ -0,0 +1,132 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright 2002-2007 H. Peter Anvin - All Rights Reserved + * + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2 or (at your + * option) any later version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * mktables.c + * + * Make RAID-6 tables. This is a host user space program to be run at + * compile time. + */ + +#include <stdio.h> +#include <string.h> +#include <inttypes.h> +#include <stdlib.h> +#include <time.h> + +static uint8_t gfmul(uint8_t a, uint8_t b) +{ + uint8_t v = 0; + + while (b) { + if (b & 1) + v ^= a; + a = (a << 1) ^ (a & 0x80 ? 0x1d : 0); + b >>= 1; + } + + return v; +} + +static uint8_t gfpow(uint8_t a, int b) +{ + uint8_t v = 1; + + b %= 255; + if (b < 0) + b += 255; + + while (b) { + if (b & 1) + v = gfmul(v, a); + a = gfmul(a, a); + b >>= 1; + } + + return v; +} + +int main(int argc, char *argv[]) +{ + int i, j, k; + uint8_t v; + uint8_t exptbl[256], invtbl[256]; + + printf("#include <linux/raid/pq.h>\n"); + + /* Compute multiplication table */ + printf("\nconst u8 __attribute__((aligned(256)))\n" + "raid6_gfmul[256][256] =\n" + "{\n"); + for (i = 0; i < 256; i++) { + printf("\t{\n"); + for (j = 0; j < 256; j += 8) { + printf("\t\t"); + for (k = 0; k < 8; k++) + printf("0x%02x,%c", gfmul(i, j + k), + (k == 7) ? '\n' : ' '); + } + printf("\t},\n"); + } + printf("};\n"); + printf("#ifdef __KERNEL__\n"); + printf("EXPORT_SYMBOL(raid6_gfmul);\n"); + printf("#endif\n"); + + /* Compute power-of-2 table (exponent) */ + v = 1; + printf("\nconst u8 __attribute__((aligned(256)))\n" + "raid6_gfexp[256] =\n" "{\n"); + for (i = 0; i < 256; i += 8) { + printf("\t"); + for (j = 0; j < 8; j++) { + exptbl[i + j] = v; + printf("0x%02x,%c", v, (j == 7) ? 
'\n' : ' '); + v = gfmul(v, 2); + if (v == 1) + v = 0; /* For entry 255, not a real entry */ + } + } + printf("};\n"); + printf("#ifdef __KERNEL__\n"); + printf("EXPORT_SYMBOL(raid6_gfexp);\n"); + printf("#endif\n"); + + /* Compute inverse table x^-1 == x^254 */ + printf("\nconst u8 __attribute__((aligned(256)))\n" + "raid6_gfinv[256] =\n" "{\n"); + for (i = 0; i < 256; i += 8) { + printf("\t"); + for (j = 0; j < 8; j++) { + invtbl[i + j] = v = gfpow(i + j, 254); + printf("0x%02x,%c", v, (j == 7) ? '\n' : ' '); + } + } + printf("};\n"); + printf("#ifdef __KERNEL__\n"); + printf("EXPORT_SYMBOL(raid6_gfinv);\n"); + printf("#endif\n"); + + /* Compute inv(2^x + 1) (exponent-xor-inverse) table */ + printf("\nconst u8 __attribute__((aligned(256)))\n" + "raid6_gfexi[256] =\n" "{\n"); + for (i = 0; i < 256; i += 8) { + printf("\t"); + for (j = 0; j < 8; j++) + printf("0x%02x,%c", invtbl[exptbl[i + j] ^ 1], + (j == 7) ? '\n' : ' '); + } + printf("};\n"); + printf("#ifdef __KERNEL__\n"); + printf("EXPORT_SYMBOL(raid6_gfexi);\n"); + printf("#endif\n"); + + return 0; +} diff --git a/lib/raid6/mmx.c b/lib/raid6/mmx.c new file mode 100644 index 000000000000..279347f23094 --- /dev/null +++ b/lib/raid6/mmx.c @@ -0,0 +1,142 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright 2002 H. Peter Anvin - All Rights Reserved + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 53 Temple Place Ste 330, + * Boston MA 02111-1307, USA; either version 2 of the License, or + * (at your option) any later version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * raid6/mmx.c + * + * MMX implementation of RAID-6 syndrome functions + */ + +#if defined(__i386__) && !defined(__arch_um__) + +#include <linux/raid/pq.h> +#include "x86.h" + +/* Shared with raid6/sse1.c */ +const struct raid6_mmx_constants { + u64 x1d; +} raid6_mmx_constants = { + 0x1d1d1d1d1d1d1d1dULL, +}; + +static int raid6_have_mmx(void) +{ + /* Not really "boot_cpu" but "all_cpus" */ + return boot_cpu_has(X86_FEATURE_MMX); +} + +/* + * Plain MMX implementation + */ +static void raid6_mmx1_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("movq %0,%%mm0" : : "m" (raid6_mmx_constants.x1d)); + asm volatile("pxor %mm5,%mm5"); /* Zero temp */ + + for ( d = 0 ; d < bytes ; d += 8 ) { + asm volatile("movq %0,%%mm2" : : "m" (dptr[z0][d])); /* P[0] */ + asm volatile("movq %mm2,%mm4"); /* Q[0] */ + for ( z = z0-1 ; z >= 0 ; z-- ) { + asm volatile("movq %0,%%mm6" : : "m" (dptr[z][d])); + asm volatile("pcmpgtb %mm4,%mm5"); + asm volatile("paddb %mm4,%mm4"); + asm volatile("pand %mm0,%mm5"); + asm volatile("pxor %mm5,%mm4"); + asm volatile("pxor %mm5,%mm5"); + asm volatile("pxor %mm6,%mm2"); + asm volatile("pxor %mm6,%mm4"); + } + asm volatile("movq %%mm2,%0" : "=m" (p[d])); + asm volatile("pxor %mm2,%mm2"); + asm volatile("movq %%mm4,%0" : "=m" (q[d])); + asm volatile("pxor %mm4,%mm4"); + } + + kernel_fpu_end(); +} + +const struct raid6_calls raid6_mmxx1 = { + raid6_mmx1_gen_syndrome, + raid6_have_mmx, + "mmxx1", + 0 +}; + +/* + * Unrolled-by-2 MMX implementation + */ +static void 
raid6_mmx2_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("movq %0,%%mm0" : : "m" (raid6_mmx_constants.x1d)); + asm volatile("pxor %mm5,%mm5"); /* Zero temp */ + asm volatile("pxor %mm7,%mm7"); /* Zero temp */ + + for ( d = 0 ; d < bytes ; d += 16 ) { + asm volatile("movq %0,%%mm2" : : "m" (dptr[z0][d])); /* P[0] */ + asm volatile("movq %0,%%mm3" : : "m" (dptr[z0][d+8])); + asm volatile("movq %mm2,%mm4"); /* Q[0] */ + asm volatile("movq %mm3,%mm6"); /* Q[1] */ + for ( z = z0-1 ; z >= 0 ; z-- ) { + asm volatile("pcmpgtb %mm4,%mm5"); + asm volatile("pcmpgtb %mm6,%mm7"); + asm volatile("paddb %mm4,%mm4"); + asm volatile("paddb %mm6,%mm6"); + asm volatile("pand %mm0,%mm5"); + asm volatile("pand %mm0,%mm7"); + asm volatile("pxor %mm5,%mm4"); + asm volatile("pxor %mm7,%mm6"); + asm volatile("movq %0,%%mm5" : : "m" (dptr[z][d])); + asm volatile("movq %0,%%mm7" : : "m" (dptr[z][d+8])); + asm volatile("pxor %mm5,%mm2"); + asm volatile("pxor %mm7,%mm3"); + asm volatile("pxor %mm5,%mm4"); + asm volatile("pxor %mm7,%mm6"); + asm volatile("pxor %mm5,%mm5"); + asm volatile("pxor %mm7,%mm7"); + } + asm volatile("movq %%mm2,%0" : "=m" (p[d])); + asm volatile("movq %%mm3,%0" : "=m" (p[d+8])); + asm volatile("movq %%mm4,%0" : "=m" (q[d])); + asm volatile("movq %%mm6,%0" : "=m" (q[d+8])); + } + + kernel_fpu_end(); +} + +const struct raid6_calls raid6_mmxx2 = { + raid6_mmx2_gen_syndrome, + raid6_have_mmx, + "mmxx2", + 0 +}; + +#endif diff --git a/lib/raid6/recov.c b/lib/raid6/recov.c new file mode 100644 index 000000000000..8590d19cf522 --- /dev/null +++ b/lib/raid6/recov.c @@ -0,0 +1,132 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright 2002 H. Peter Anvin - All Rights Reserved + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 53 Temple Place Ste 330, + * Boston MA 02111-1307, USA; either version 2 of the License, or + * (at your option) any later version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * raid6/recov.c + * + * RAID-6 data recovery in dual failure mode. In single failure mode, + * use the RAID-5 algorithm (or, in the case of Q failure, just reconstruct + * the syndrome.) + */ + +#include <linux/raid/pq.h> + +/* Recover two failed data blocks. 
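The dual-failure path below solves a 2x2 system over GF(2^8). As a rough sketch of the algebra (here "+" is field addition, i.e. XOR, g is the field generator, a = faila < b = failb, and P0/Q0 are the syndromes recomputed with the failed blocks zeroed, as the code does):

	Pxy = P + P0 = Da + Db
	Qxy = Q + Q0 = g^a*Da + g^b*Db

	Db  = Pxy/(g^(b-a) + 1) + Qxy/(g^a + g^b)
	Da  = Db + Pxy

The pbmul lookup table below evaluates 1/(g^(b-a) + 1) and qmul evaluates 1/(g^a + g^b), one byte at a time. The data+P case is simpler, Da = (Q + Q0)/g^a, which is what raid6_datap_recov() computes with the single qmul table.
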
*/ +void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, + void **ptrs) +{ + u8 *p, *q, *dp, *dq; + u8 px, qx, db; + const u8 *pbmul; /* P multiplier table for B data */ + const u8 *qmul; /* Q multiplier table (for both) */ + + p = (u8 *)ptrs[disks-2]; + q = (u8 *)ptrs[disks-1]; + + /* Compute syndrome with zero for the missing data pages + Use the dead data pages as temporary storage for + delta p and delta q */ + dp = (u8 *)ptrs[faila]; + ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[disks-2] = dp; + dq = (u8 *)ptrs[failb]; + ptrs[failb] = (void *)raid6_empty_zero_page; + ptrs[disks-1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dp; + ptrs[failb] = dq; + ptrs[disks-2] = p; + ptrs[disks-1] = q; + + /* Now, pick the proper data tables */ + pbmul = raid6_gfmul[raid6_gfexi[failb-faila]]; + qmul = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]^raid6_gfexp[failb]]]; + + /* Now do it... */ + while ( bytes-- ) { + px = *p ^ *dp; + qx = qmul[*q ^ *dq]; + *dq++ = db = pbmul[px] ^ qx; /* Reconstructed B */ + *dp++ = db ^ px; /* Reconstructed A */ + p++; q++; + } +} +EXPORT_SYMBOL_GPL(raid6_2data_recov); + +/* Recover failure of one data block plus the P block */ +void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs) +{ + u8 *p, *q, *dq; + const u8 *qmul; /* Q multiplier table */ + + p = (u8 *)ptrs[disks-2]; + q = (u8 *)ptrs[disks-1]; + + /* Compute syndrome with zero for the missing data page + Use the dead data page as temporary storage for delta q */ + dq = (u8 *)ptrs[faila]; + ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[disks-1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dq; + ptrs[disks-1] = q; + + /* Now, pick the proper data tables */ + qmul = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]]]; + + /* Now do it... */ + while ( bytes-- ) { + *p++ ^= *dq = qmul[*q ^ *dq]; + q++; dq++; + } +} +EXPORT_SYMBOL_GPL(raid6_datap_recov); + +#ifndef __KERNEL__ +/* Testing only */ + +/* Recover two failed blocks. */ +void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, void **ptrs) +{ + if ( faila > failb ) { + int tmp = faila; + faila = failb; + failb = tmp; + } + + if ( failb == disks-1 ) { + if ( faila == disks-2 ) { + /* P+Q failure. Just rebuild the syndrome. */ + raid6_call.gen_syndrome(disks, bytes, ptrs); + } else { + /* data+Q failure. Reconstruct data from P, + then rebuild syndrome. */ + /* NOT IMPLEMENTED - equivalent to RAID-5 */ + } + } else { + if ( failb == disks-2 ) { + /* data+P failure. */ + raid6_datap_recov(disks, bytes, faila, ptrs); + } else { + /* data+data failure. */ + raid6_2data_recov(disks, bytes, faila, failb, ptrs); + } + } +} + +#endif diff --git a/lib/raid6/sse1.c b/lib/raid6/sse1.c new file mode 100644 index 000000000000..10dd91948c07 --- /dev/null +++ b/lib/raid6/sse1.c @@ -0,0 +1,162 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright 2002 H. Peter Anvin - All Rights Reserved + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 53 Temple Place Ste 330, + * Boston MA 02111-1307, USA; either version 2 of the License, or + * (at your option) any later version; incorporated herein by reference. 
+ * + * ----------------------------------------------------------------------- */ + +/* + * raid6/sse1.c + * + * SSE-1/MMXEXT implementation of RAID-6 syndrome functions + * + * This is really an MMX implementation, but it requires SSE-1 or + * AMD MMXEXT for prefetch support and a few other features. The + * support for nontemporal memory accesses is enough to make this + * worthwhile as a separate implementation. + */ + +#if defined(__i386__) && !defined(__arch_um__) + +#include <linux/raid/pq.h> +#include "x86.h" + +/* Defined in raid6/mmx.c */ +extern const struct raid6_mmx_constants { + u64 x1d; +} raid6_mmx_constants; + +static int raid6_have_sse1_or_mmxext(void) +{ + /* Not really boot_cpu but "all_cpus" */ + return boot_cpu_has(X86_FEATURE_MMX) && + (boot_cpu_has(X86_FEATURE_XMM) || + boot_cpu_has(X86_FEATURE_MMXEXT)); +} + +/* + * Plain SSE1 implementation + */ +static void raid6_sse11_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("movq %0,%%mm0" : : "m" (raid6_mmx_constants.x1d)); + asm volatile("pxor %mm5,%mm5"); /* Zero temp */ + + for ( d = 0 ; d < bytes ; d += 8 ) { + asm volatile("prefetchnta %0" : : "m" (dptr[z0][d])); + asm volatile("movq %0,%%mm2" : : "m" (dptr[z0][d])); /* P[0] */ + asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d])); + asm volatile("movq %mm2,%mm4"); /* Q[0] */ + asm volatile("movq %0,%%mm6" : : "m" (dptr[z0-1][d])); + for ( z = z0-2 ; z >= 0 ; z-- ) { + asm volatile("prefetchnta %0" : : "m" (dptr[z][d])); + asm volatile("pcmpgtb %mm4,%mm5"); + asm volatile("paddb %mm4,%mm4"); + asm volatile("pand %mm0,%mm5"); + asm volatile("pxor %mm5,%mm4"); + asm volatile("pxor %mm5,%mm5"); + asm volatile("pxor %mm6,%mm2"); + asm volatile("pxor %mm6,%mm4"); + asm volatile("movq %0,%%mm6" : : "m" (dptr[z][d])); + } + asm volatile("pcmpgtb %mm4,%mm5"); + asm volatile("paddb %mm4,%mm4"); + asm volatile("pand %mm0,%mm5"); + asm volatile("pxor %mm5,%mm4"); + asm volatile("pxor %mm5,%mm5"); + asm volatile("pxor %mm6,%mm2"); + asm volatile("pxor %mm6,%mm4"); + + asm volatile("movntq %%mm2,%0" : "=m" (p[d])); + asm volatile("movntq %%mm4,%0" : "=m" (q[d])); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + +const struct raid6_calls raid6_sse1x1 = { + raid6_sse11_gen_syndrome, + raid6_have_sse1_or_mmxext, + "sse1x1", + 1 /* Has cache hints */ +}; + +/* + * Unrolled-by-2 SSE1 implementation + */ +static void raid6_sse12_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("movq %0,%%mm0" : : "m" (raid6_mmx_constants.x1d)); + asm volatile("pxor %mm5,%mm5"); /* Zero temp */ + asm volatile("pxor %mm7,%mm7"); /* Zero temp */ + + /* We uniformly assume a single prefetch covers at least 16 bytes */ + for ( d = 0 ; d < bytes ; d += 16 ) { + asm volatile("prefetchnta %0" : : "m" (dptr[z0][d])); + asm volatile("movq %0,%%mm2" : : "m" (dptr[z0][d])); /* P[0] */ + asm volatile("movq %0,%%mm3" : : "m" (dptr[z0][d+8])); /* P[1] */ + asm volatile("movq %mm2,%mm4"); /* Q[0] */ + asm volatile("movq %mm3,%mm6"); /* Q[1] */ + for ( z = z0-1 ; z >= 0 ; z-- ) { + asm volatile("prefetchnta %0" : : "m" (dptr[z][d])); + asm volatile("pcmpgtb 
%mm4,%mm5"); + asm volatile("pcmpgtb %mm6,%mm7"); + asm volatile("paddb %mm4,%mm4"); + asm volatile("paddb %mm6,%mm6"); + asm volatile("pand %mm0,%mm5"); + asm volatile("pand %mm0,%mm7"); + asm volatile("pxor %mm5,%mm4"); + asm volatile("pxor %mm7,%mm6"); + asm volatile("movq %0,%%mm5" : : "m" (dptr[z][d])); + asm volatile("movq %0,%%mm7" : : "m" (dptr[z][d+8])); + asm volatile("pxor %mm5,%mm2"); + asm volatile("pxor %mm7,%mm3"); + asm volatile("pxor %mm5,%mm4"); + asm volatile("pxor %mm7,%mm6"); + asm volatile("pxor %mm5,%mm5"); + asm volatile("pxor %mm7,%mm7"); + } + asm volatile("movntq %%mm2,%0" : "=m" (p[d])); + asm volatile("movntq %%mm3,%0" : "=m" (p[d+8])); + asm volatile("movntq %%mm4,%0" : "=m" (q[d])); + asm volatile("movntq %%mm6,%0" : "=m" (q[d+8])); + } + + asm volatile("sfence" : :: "memory"); + kernel_fpu_end(); +} + +const struct raid6_calls raid6_sse1x2 = { + raid6_sse12_gen_syndrome, + raid6_have_sse1_or_mmxext, + "sse1x2", + 1 /* Has cache hints */ +}; + +#endif diff --git a/lib/raid6/sse2.c b/lib/raid6/sse2.c new file mode 100644 index 000000000000..bc2d57daa589 --- /dev/null +++ b/lib/raid6/sse2.c @@ -0,0 +1,262 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright 2002 H. Peter Anvin - All Rights Reserved + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 53 Temple Place Ste 330, + * Boston MA 02111-1307, USA; either version 2 of the License, or + * (at your option) any later version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * raid6/sse2.c + * + * SSE-2 implementation of RAID-6 syndrome functions + * + */ + +#if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__) + +#include <linux/raid/pq.h> +#include "x86.h" + +static const struct raid6_sse_constants { + u64 x1d[2]; +} raid6_sse_constants __attribute__((aligned(16))) = { + { 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL }, +}; + +static int raid6_have_sse2(void) +{ + /* Not really boot_cpu but "all_cpus" */ + return boot_cpu_has(X86_FEATURE_MMX) && + boot_cpu_has(X86_FEATURE_FXSR) && + boot_cpu_has(X86_FEATURE_XMM) && + boot_cpu_has(X86_FEATURE_XMM2); +} + +/* + * Plain SSE2 implementation + */ +static void raid6_sse21_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0])); + asm volatile("pxor %xmm5,%xmm5"); /* Zero temp */ + + for ( d = 0 ; d < bytes ; d += 16 ) { + asm volatile("prefetchnta %0" : : "m" (dptr[z0][d])); + asm volatile("movdqa %0,%%xmm2" : : "m" (dptr[z0][d])); /* P[0] */ + asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d])); + asm volatile("movdqa %xmm2,%xmm4"); /* Q[0] */ + asm volatile("movdqa %0,%%xmm6" : : "m" (dptr[z0-1][d])); + for ( z = z0-2 ; z >= 0 ; z-- ) { + asm volatile("prefetchnta %0" : : "m" (dptr[z][d])); + asm volatile("pcmpgtb %xmm4,%xmm5"); + asm volatile("paddb %xmm4,%xmm4"); + asm volatile("pand %xmm0,%xmm5"); + asm volatile("pxor %xmm5,%xmm4"); + asm volatile("pxor %xmm5,%xmm5"); + asm volatile("pxor %xmm6,%xmm2"); + asm volatile("pxor %xmm6,%xmm4"); + asm volatile("movdqa %0,%%xmm6" : : "m" (dptr[z][d])); + } + asm 
volatile("pcmpgtb %xmm4,%xmm5"); + asm volatile("paddb %xmm4,%xmm4"); + asm volatile("pand %xmm0,%xmm5"); + asm volatile("pxor %xmm5,%xmm4"); + asm volatile("pxor %xmm5,%xmm5"); + asm volatile("pxor %xmm6,%xmm2"); + asm volatile("pxor %xmm6,%xmm4"); + + asm volatile("movntdq %%xmm2,%0" : "=m" (p[d])); + asm volatile("pxor %xmm2,%xmm2"); + asm volatile("movntdq %%xmm4,%0" : "=m" (q[d])); + asm volatile("pxor %xmm4,%xmm4"); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + +const struct raid6_calls raid6_sse2x1 = { + raid6_sse21_gen_syndrome, + raid6_have_sse2, + "sse2x1", + 1 /* Has cache hints */ +}; + +/* + * Unrolled-by-2 SSE2 implementation + */ +static void raid6_sse22_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0])); + asm volatile("pxor %xmm5,%xmm5"); /* Zero temp */ + asm volatile("pxor %xmm7,%xmm7"); /* Zero temp */ + + /* We uniformly assume a single prefetch covers at least 32 bytes */ + for ( d = 0 ; d < bytes ; d += 32 ) { + asm volatile("prefetchnta %0" : : "m" (dptr[z0][d])); + asm volatile("movdqa %0,%%xmm2" : : "m" (dptr[z0][d])); /* P[0] */ + asm volatile("movdqa %0,%%xmm3" : : "m" (dptr[z0][d+16])); /* P[1] */ + asm volatile("movdqa %xmm2,%xmm4"); /* Q[0] */ + asm volatile("movdqa %xmm3,%xmm6"); /* Q[1] */ + for ( z = z0-1 ; z >= 0 ; z-- ) { + asm volatile("prefetchnta %0" : : "m" (dptr[z][d])); + asm volatile("pcmpgtb %xmm4,%xmm5"); + asm volatile("pcmpgtb %xmm6,%xmm7"); + asm volatile("paddb %xmm4,%xmm4"); + asm volatile("paddb %xmm6,%xmm6"); + asm volatile("pand %xmm0,%xmm5"); + asm volatile("pand %xmm0,%xmm7"); + asm volatile("pxor %xmm5,%xmm4"); + asm volatile("pxor %xmm7,%xmm6"); + asm volatile("movdqa %0,%%xmm5" : : "m" (dptr[z][d])); + asm volatile("movdqa %0,%%xmm7" : : "m" (dptr[z][d+16])); + asm volatile("pxor %xmm5,%xmm2"); + asm volatile("pxor %xmm7,%xmm3"); + asm volatile("pxor %xmm5,%xmm4"); + asm volatile("pxor %xmm7,%xmm6"); + asm volatile("pxor %xmm5,%xmm5"); + asm volatile("pxor %xmm7,%xmm7"); + } + asm volatile("movntdq %%xmm2,%0" : "=m" (p[d])); + asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16])); + asm volatile("movntdq %%xmm4,%0" : "=m" (q[d])); + asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16])); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + +const struct raid6_calls raid6_sse2x2 = { + raid6_sse22_gen_syndrome, + raid6_have_sse2, + "sse2x2", + 1 /* Has cache hints */ +}; + +#endif + +#if defined(__x86_64__) && !defined(__arch_um__) + +/* + * Unrolled-by-4 SSE2 implementation + */ +static void raid6_sse24_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("movdqa %0,%%xmm0" :: "m" (raid6_sse_constants.x1d[0])); + asm volatile("pxor %xmm2,%xmm2"); /* P[0] */ + asm volatile("pxor %xmm3,%xmm3"); /* P[1] */ + asm volatile("pxor %xmm4,%xmm4"); /* Q[0] */ + asm volatile("pxor %xmm5,%xmm5"); /* Zero temp */ + asm volatile("pxor %xmm6,%xmm6"); /* Q[1] */ + asm volatile("pxor %xmm7,%xmm7"); /* Zero temp */ + asm volatile("pxor %xmm10,%xmm10"); /* P[2] */ + asm volatile("pxor %xmm11,%xmm11"); /* P[3] */ + asm volatile("pxor 
%xmm12,%xmm12"); /* Q[2] */ + asm volatile("pxor %xmm13,%xmm13"); /* Zero temp */ + asm volatile("pxor %xmm14,%xmm14"); /* Q[3] */ + asm volatile("pxor %xmm15,%xmm15"); /* Zero temp */ + + for ( d = 0 ; d < bytes ; d += 64 ) { + for ( z = z0 ; z >= 0 ; z-- ) { + /* The second prefetch seems to improve performance... */ + asm volatile("prefetchnta %0" :: "m" (dptr[z][d])); + asm volatile("prefetchnta %0" :: "m" (dptr[z][d+32])); + asm volatile("pcmpgtb %xmm4,%xmm5"); + asm volatile("pcmpgtb %xmm6,%xmm7"); + asm volatile("pcmpgtb %xmm12,%xmm13"); + asm volatile("pcmpgtb %xmm14,%xmm15"); + asm volatile("paddb %xmm4,%xmm4"); + asm volatile("paddb %xmm6,%xmm6"); + asm volatile("paddb %xmm12,%xmm12"); + asm volatile("paddb %xmm14,%xmm14"); + asm volatile("pand %xmm0,%xmm5"); + asm volatile("pand %xmm0,%xmm7"); + asm volatile("pand %xmm0,%xmm13"); + asm volatile("pand %xmm0,%xmm15"); + asm volatile("pxor %xmm5,%xmm4"); + asm volatile("pxor %xmm7,%xmm6"); + asm volatile("pxor %xmm13,%xmm12"); + asm volatile("pxor %xmm15,%xmm14"); + asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d])); + asm volatile("movdqa %0,%%xmm7" :: "m" (dptr[z][d+16])); + asm volatile("movdqa %0,%%xmm13" :: "m" (dptr[z][d+32])); + asm volatile("movdqa %0,%%xmm15" :: "m" (dptr[z][d+48])); + asm volatile("pxor %xmm5,%xmm2"); + asm volatile("pxor %xmm7,%xmm3"); + asm volatile("pxor %xmm13,%xmm10"); + asm volatile("pxor %xmm15,%xmm11"); + asm volatile("pxor %xmm5,%xmm4"); + asm volatile("pxor %xmm7,%xmm6"); + asm volatile("pxor %xmm13,%xmm12"); + asm volatile("pxor %xmm15,%xmm14"); + asm volatile("pxor %xmm5,%xmm5"); + asm volatile("pxor %xmm7,%xmm7"); + asm volatile("pxor %xmm13,%xmm13"); + asm volatile("pxor %xmm15,%xmm15"); + } + asm volatile("movntdq %%xmm2,%0" : "=m" (p[d])); + asm volatile("pxor %xmm2,%xmm2"); + asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16])); + asm volatile("pxor %xmm3,%xmm3"); + asm volatile("movntdq %%xmm10,%0" : "=m" (p[d+32])); + asm volatile("pxor %xmm10,%xmm10"); + asm volatile("movntdq %%xmm11,%0" : "=m" (p[d+48])); + asm volatile("pxor %xmm11,%xmm11"); + asm volatile("movntdq %%xmm4,%0" : "=m" (q[d])); + asm volatile("pxor %xmm4,%xmm4"); + asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16])); + asm volatile("pxor %xmm6,%xmm6"); + asm volatile("movntdq %%xmm12,%0" : "=m" (q[d+32])); + asm volatile("pxor %xmm12,%xmm12"); + asm volatile("movntdq %%xmm14,%0" : "=m" (q[d+48])); + asm volatile("pxor %xmm14,%xmm14"); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + +const struct raid6_calls raid6_sse2x4 = { + raid6_sse24_gen_syndrome, + raid6_have_sse2, + "sse2x4", + 1 /* Has cache hints */ +}; + +#endif diff --git a/lib/raid6/test/Makefile b/lib/raid6/test/Makefile new file mode 100644 index 000000000000..aa651697b6dc --- /dev/null +++ b/lib/raid6/test/Makefile @@ -0,0 +1,72 @@ +# +# This is a simple Makefile to test some of the RAID-6 code +# from userspace. +# + +CC = gcc +OPTFLAGS = -O2 # Adjust as desired +CFLAGS = -I.. 
-I ../../../include -g $(OPTFLAGS) +LD = ld +AWK = awk -f +AR = ar +RANLIB = ranlib + +.c.o: + $(CC) $(CFLAGS) -c -o $@ $< + +%.c: ../%.c + cp -f $< $@ + +%.uc: ../%.uc + cp -f $< $@ + +all: raid6.a raid6test + +raid6.a: int1.o int2.o int4.o int8.o int16.o int32.o mmx.o sse1.o sse2.o \ + altivec1.o altivec2.o altivec4.o altivec8.o recov.o algos.o \ + tables.o + rm -f $@ + $(AR) cq $@ $^ + $(RANLIB) $@ + +raid6test: test.c raid6.a + $(CC) $(CFLAGS) -o raid6test $^ + +altivec1.c: altivec.uc ../unroll.awk + $(AWK) ../unroll.awk -vN=1 < altivec.uc > $@ + +altivec2.c: altivec.uc ../unroll.awk + $(AWK) ../unroll.awk -vN=2 < altivec.uc > $@ + +altivec4.c: altivec.uc ../unroll.awk + $(AWK) ../unroll.awk -vN=4 < altivec.uc > $@ + +altivec8.c: altivec.uc ../unroll.awk + $(AWK) ../unroll.awk -vN=8 < altivec.uc > $@ + +int1.c: int.uc ../unroll.awk + $(AWK) ../unroll.awk -vN=1 < int.uc > $@ + +int2.c: int.uc ../unroll.awk + $(AWK) ../unroll.awk -vN=2 < int.uc > $@ + +int4.c: int.uc ../unroll.awk + $(AWK) ../unroll.awk -vN=4 < int.uc > $@ + +int8.c: int.uc ../unroll.awk + $(AWK) ../unroll.awk -vN=8 < int.uc > $@ + +int16.c: int.uc ../unroll.awk + $(AWK) ../unroll.awk -vN=16 < int.uc > $@ + +int32.c: int.uc ../unroll.awk + $(AWK) ../unroll.awk -vN=32 < int.uc > $@ + +tables.c: mktables + ./mktables > tables.c + +clean: + rm -f *.o *.a mktables mktables.c *.uc int*.c altivec*.c tables.c raid6test + +spotless: clean + rm -f *~ diff --git a/lib/raid6/test/test.c b/lib/raid6/test/test.c new file mode 100644 index 000000000000..7a930318b17d --- /dev/null +++ b/lib/raid6/test/test.c @@ -0,0 +1,124 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright 2002-2007 H. Peter Anvin - All Rights Reserved + * + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2 or (at your + * option) any later version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * raid6test.c + * + * Test RAID-6 recovery with various algorithms + */ + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <linux/raid/pq.h> + +#define NDISKS 16 /* Including P and Q */ + +const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256))); +struct raid6_calls raid6_call; + +char *dataptrs[NDISKS]; +char data[NDISKS][PAGE_SIZE]; +char recovi[PAGE_SIZE], recovj[PAGE_SIZE]; + +static void makedata(void) +{ + int i, j; + + for (i = 0; i < NDISKS; i++) { + for (j = 0; j < PAGE_SIZE; j++) + data[i][j] = rand(); + + dataptrs[i] = data[i]; + } +} + +static char disk_type(int d) +{ + switch (d) { + case NDISKS-2: + return 'P'; + case NDISKS-1: + return 'Q'; + default: + return 'D'; + } +} + +static int test_disks(int i, int j) +{ + int erra, errb; + + memset(recovi, 0xf0, PAGE_SIZE); + memset(recovj, 0xba, PAGE_SIZE); + + dataptrs[i] = recovi; + dataptrs[j] = recovj; + + raid6_dual_recov(NDISKS, PAGE_SIZE, i, j, (void **)&dataptrs); + + erra = memcmp(data[i], recovi, PAGE_SIZE); + errb = memcmp(data[j], recovj, PAGE_SIZE); + + if (i < NDISKS-2 && j == NDISKS-1) { + /* We don't implement the DQ failure scenario, since it's + equivalent to a RAID-5 failure (XOR, then recompute Q) */ + erra = errb = 0; + } else { + printf("algo=%-8s faila=%3d(%c) failb=%3d(%c) %s\n", + raid6_call.name, + i, disk_type(i), + j, disk_type(j), + (!erra && !errb) ? "OK" : + !erra ? "ERRB" : + !errb ? 
"ERRA" : "ERRAB"); + } + + dataptrs[i] = data[i]; + dataptrs[j] = data[j]; + + return erra || errb; +} + +int main(int argc, char *argv[]) +{ + const struct raid6_calls *const *algo; + int i, j; + int err = 0; + + makedata(); + + for (algo = raid6_algos; *algo; algo++) { + if (!(*algo)->valid || (*algo)->valid()) { + raid6_call = **algo; + + /* Nuke syndromes */ + memset(data[NDISKS-2], 0xee, 2*PAGE_SIZE); + + /* Generate assumed good syndrome */ + raid6_call.gen_syndrome(NDISKS, PAGE_SIZE, + (void **)&dataptrs); + + for (i = 0; i < NDISKS-1; i++) + for (j = i+1; j < NDISKS; j++) + err += test_disks(i, j); + } + printf("\n"); + } + + printf("\n"); + /* Pick the best algorithm test */ + raid6_select_algo(); + + if (err) + printf("\n*** ERRORS FOUND ***\n"); + + return err; +} diff --git a/lib/raid6/unroll.awk b/lib/raid6/unroll.awk new file mode 100644 index 000000000000..c6aa03631df8 --- /dev/null +++ b/lib/raid6/unroll.awk @@ -0,0 +1,20 @@ + +# This filter requires one command line option of form -vN=n +# where n must be a decimal number. +# +# Repeat each input line containing $$ n times, replacing $$ with 0...n-1. +# Replace each $# with n, and each $* with a single $. + +BEGIN { + n = N + 0 +} +{ + if (/\$\$/) { rep = n } else { rep = 1 } + for (i = 0; i < rep; ++i) { + tmp = $0 + gsub(/\$\$/, i, tmp) + gsub(/\$\#/, n, tmp) + gsub(/\$\*/, "$", tmp) + print tmp + } +} diff --git a/lib/raid6/x86.h b/lib/raid6/x86.h new file mode 100644 index 000000000000..cb2a8c91c886 --- /dev/null +++ b/lib/raid6/x86.h @@ -0,0 +1,61 @@ +/* ----------------------------------------------------------------------- * + * + * Copyright 2002-2004 H. Peter Anvin - All Rights Reserved + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 53 Temple Place Ste 330, + * Boston MA 02111-1307, USA; either version 2 of the License, or + * (at your option) any later version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * raid6/x86.h + * + * Definitions common to x86 and x86-64 RAID-6 code only + */ + +#ifndef LINUX_RAID_RAID6X86_H +#define LINUX_RAID_RAID6X86_H + +#if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__) + +#ifdef __KERNEL__ /* Real code */ + +#include <asm/i387.h> + +#else /* Dummy code for user space testing */ + +static inline void kernel_fpu_begin(void) +{ +} + +static inline void kernel_fpu_end(void) +{ +} + +#define X86_FEATURE_MMX (0*32+23) /* Multimedia Extensions */ +#define X86_FEATURE_FXSR (0*32+24) /* FXSAVE and FXRSTOR instructions + * (fast save and restore) */ +#define X86_FEATURE_XMM (0*32+25) /* Streaming SIMD Extensions */ +#define X86_FEATURE_XMM2 (0*32+26) /* Streaming SIMD Extensions-2 */ +#define X86_FEATURE_MMXEXT (1*32+22) /* AMD MMX extensions */ + +/* Should work well enough on modern CPUs for testing */ +static inline int boot_cpu_has(int flag) +{ + u32 eax = (flag >> 5) ? 
0x80000001 : 1; + u32 edx; + + asm volatile("cpuid" + : "+a" (eax), "=d" (edx) + : : "ecx", "ebx"); + + return (edx >> (flag & 31)) & 1; +} + +#endif /* ndef __KERNEL__ */ + +#endif +#endif diff --git a/lib/random32.c b/lib/random32.c index 217d5c4b666d..fc3545a32771 100644 --- a/lib/random32.c +++ b/lib/random32.c @@ -39,13 +39,16 @@ #include <linux/jiffies.h> #include <linux/random.h> -struct rnd_state { - u32 s1, s2, s3; -}; - static DEFINE_PER_CPU(struct rnd_state, net_rand_state); -static u32 __random32(struct rnd_state *state) +/** + * prandom32 - seeded pseudo-random number generator. + * @state: pointer to state structure holding seeded state. + * + * This is used for pseudo-randomness with no outside seeding. + * For more random results, use random32(). + */ +u32 prandom32(struct rnd_state *state) { #define TAUSWORTHE(s,a,b,c,d) ((s&c)<<d) ^ (((s <<a) ^ s)>>b) @@ -55,14 +58,7 @@ static u32 __random32(struct rnd_state *state) return (state->s1 ^ state->s2 ^ state->s3); } - -/* - * Handle minimum values for seeds - */ -static inline u32 __seed(u32 x, u32 m) -{ - return (x < m) ? x + m : x; -} +EXPORT_SYMBOL(prandom32); /** * random32 - pseudo random number generator @@ -75,7 +71,7 @@ u32 random32(void) { unsigned long r; struct rnd_state *state = &get_cpu_var(net_rand_state); - r = __random32(state); + r = prandom32(state); put_cpu_var(state); return r; } @@ -118,12 +114,12 @@ static int __init random32_init(void) state->s3 = __seed(LCG(state->s2), 15); /* "warm it up" */ - __random32(state); - __random32(state); - __random32(state); - __random32(state); - __random32(state); - __random32(state); + prandom32(state); + prandom32(state); + prandom32(state); + prandom32(state); + prandom32(state); + prandom32(state); } return 0; } @@ -131,7 +127,7 @@ core_initcall(random32_init); /* * Generate better values after random number generator - * is fully initalized. + * is fully initialized. */ static int __init random32_reseed(void) { @@ -147,7 +143,7 @@ static int __init random32_reseed(void) state->s3 = __seed(seeds[2], 15); /* mix it in */ - __random32(state); + prandom32(state); } return 0; } diff --git a/lib/ratelimit.c b/lib/ratelimit.c index 09f5ce1810dc..027a03f4c56d 100644 --- a/lib/ratelimit.c +++ b/lib/ratelimit.c @@ -16,9 +16,14 @@ /* * __ratelimit - rate limiting * @rs: ratelimit_state data + * @func: name of calling function * - * This enforces a rate limit: not more than @rs->ratelimit_burst callbacks - * in every @rs->ratelimit_jiffies + * This enforces a rate limit: not more than @rs->burst callbacks + * in every @rs->interval + * + * RETURNS: + * 0 means callbacks will be suppressed. + * 1 means go ahead and do it. 
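A typical use of the return convention documented above looks roughly like this (the state name, interval, burst and message are made up for illustration; __ratelimit() is the wrapper that passes __func__ to ___ratelimit()):

	static DEFINE_RATELIMIT_STATE(my_rs, 5 * HZ, 10);	/* at most 10 per 5 seconds */

	if (__ratelimit(&my_rs))
		printk(KERN_WARNING "frobnicator stalled, resetting\n");
	/* a 0 return means this particular callback was suppressed */
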
*/ int ___ratelimit(struct ratelimit_state *rs, const char *func) { @@ -35,7 +40,7 @@ int ___ratelimit(struct ratelimit_state *rs, const char *func) * the entity that is holding the lock already: */ if (!spin_trylock_irqsave(&rs->lock, flags)) - return 1; + return 0; if (!rs->begin) rs->begin = jiffies; diff --git a/lib/rbtree.c b/lib/rbtree.c index e2aa3be29858..4693f79195d3 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -283,6 +283,74 @@ void rb_erase(struct rb_node *node, struct rb_root *root) } EXPORT_SYMBOL(rb_erase); +static void rb_augment_path(struct rb_node *node, rb_augment_f func, void *data) +{ + struct rb_node *parent; + +up: + func(node, data); + parent = rb_parent(node); + if (!parent) + return; + + if (node == parent->rb_left && parent->rb_right) + func(parent->rb_right, data); + else if (parent->rb_left) + func(parent->rb_left, data); + + node = parent; + goto up; +} + +/* + * after inserting @node into the tree, update the tree to account for + * both the new entry and any damage done by rebalance + */ +void rb_augment_insert(struct rb_node *node, rb_augment_f func, void *data) +{ + if (node->rb_left) + node = node->rb_left; + else if (node->rb_right) + node = node->rb_right; + + rb_augment_path(node, func, data); +} + +/* + * before removing the node, find the deepest node on the rebalance path + * that will still be there after @node gets removed + */ +struct rb_node *rb_augment_erase_begin(struct rb_node *node) +{ + struct rb_node *deepest; + + if (!node->rb_right && !node->rb_left) + deepest = rb_parent(node); + else if (!node->rb_right) + deepest = node->rb_left; + else if (!node->rb_left) + deepest = node->rb_right; + else { + deepest = rb_next(node); + if (deepest->rb_right) + deepest = deepest->rb_right; + else if (rb_parent(deepest) != node) + deepest = rb_parent(deepest); + } + + return deepest; +} + +/* + * after removal, update the tree to account for the removed entry + * and any rebalance damage. + */ +void rb_augment_erase_end(struct rb_node *node, rb_augment_f func, void *data) +{ + if (node) + rb_augment_path(node, func, data); +} + /* * This function returns the first node (in sort order) of the tree. 
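A minimal sketch of how the rb_augment_* hooks added above are meant to be used, maintaining a per-node maximum over a subtree; the node type, field names and helpers are hypothetical:

	struct itree_node {
		struct rb_node	rb;
		unsigned long	last;		/* this node's own value */
		unsigned long	max_last;	/* max of 'last' over the subtree */
	};

	static void itree_update(struct rb_node *rb, void *unused)
	{
		struct itree_node *n = rb_entry(rb, struct itree_node, rb);
		unsigned long m = n->last;

		if (rb->rb_left)
			m = max(m, rb_entry(rb->rb_left, struct itree_node, rb)->max_last);
		if (rb->rb_right)
			m = max(m, rb_entry(rb->rb_right, struct itree_node, rb)->max_last);
		n->max_last = m;
	}

	static void itree_insert(struct itree_node *node, struct rb_root *root)
	{
		/* ... usual rb_link_node() walk, then rebalance ... */
		rb_insert_color(&node->rb, root);
		rb_augment_insert(&node->rb, itree_update, NULL);
	}

	static void itree_erase(struct itree_node *node, struct rb_root *root)
	{
		struct rb_node *deepest = rb_augment_erase_begin(&node->rb);

		rb_erase(&node->rb, root);
		rb_augment_erase_end(deepest, itree_update, NULL);
	}
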
*/ diff --git a/lib/rwsem-spinlock.c b/lib/rwsem-spinlock.c index ccf95bff7984..ffc9fc7f3b05 100644 --- a/lib/rwsem-spinlock.c +++ b/lib/rwsem-spinlock.c @@ -143,13 +143,14 @@ void __sched __down_read(struct rw_semaphore *sem) { struct rwsem_waiter waiter; struct task_struct *tsk; + unsigned long flags; - spin_lock_irq(&sem->wait_lock); + spin_lock_irqsave(&sem->wait_lock, flags); if (sem->activity >= 0 && list_empty(&sem->wait_list)) { /* granted */ sem->activity++; - spin_unlock_irq(&sem->wait_lock); + spin_unlock_irqrestore(&sem->wait_lock, flags); goto out; } @@ -164,7 +165,7 @@ void __sched __down_read(struct rw_semaphore *sem) list_add_tail(&waiter.list, &sem->wait_list); /* we don't need to touch the semaphore struct anymore */ - spin_unlock_irq(&sem->wait_lock); + spin_unlock_irqrestore(&sem->wait_lock, flags); /* wait to be given the lock */ for (;;) { @@ -209,13 +210,14 @@ void __sched __down_write_nested(struct rw_semaphore *sem, int subclass) { struct rwsem_waiter waiter; struct task_struct *tsk; + unsigned long flags; - spin_lock_irq(&sem->wait_lock); + spin_lock_irqsave(&sem->wait_lock, flags); if (sem->activity == 0 && list_empty(&sem->wait_list)) { /* granted */ sem->activity = -1; - spin_unlock_irq(&sem->wait_lock); + spin_unlock_irqrestore(&sem->wait_lock, flags); goto out; } @@ -230,7 +232,7 @@ void __sched __down_write_nested(struct rw_semaphore *sem, int subclass) list_add_tail(&waiter.list, &sem->wait_list); /* we don't need to touch the semaphore struct anymore */ - spin_unlock_irq(&sem->wait_lock); + spin_unlock_irqrestore(&sem->wait_lock, flags); /* wait to be given the lock */ for (;;) { diff --git a/lib/rwsem.c b/lib/rwsem.c index 3e3365e5665e..f236d7cd5cf3 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -36,45 +36,56 @@ struct rwsem_waiter { #define RWSEM_WAITING_FOR_WRITE 0x00000002 }; +/* Wake types for __rwsem_do_wake(). Note that RWSEM_WAKE_NO_ACTIVE and + * RWSEM_WAKE_READ_OWNED imply that the spinlock must have been kept held + * since the rwsem value was observed. 
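[Editorial note on the rwsem-spinlock.c hunk above, before the rwsem.c changes continue] The slow paths switch from spin_lock_irq() to spin_lock_irqsave(). The practical difference is that the irqsave variant records whether interrupts were already disabled and restores exactly that state, instead of unconditionally re-enabling them on unlock. A minimal hedged sketch of the distinction; my_lock and the surrounding context are hypothetical.

    #include <linux/spinlock.h>

    static DEFINE_SPINLOCK(my_lock);

    static void caller_with_irqs_off(void)
    {
            unsigned long outer, inner;

            local_irq_save(outer);                   /* IRQs are now off */

            spin_lock_irqsave(&my_lock, inner);      /* they stay off inside */
            /* ... critical section ... */
            spin_unlock_irqrestore(&my_lock, inner); /* back to "off", as saved */

            /* spin_lock_irq()/spin_unlock_irq() here would have force-enabled
             * interrupts on unlock, even though this caller still expects
             * them to be disabled until local_irq_restore() below */

            local_irq_restore(outer);
    }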
+ */ +#define RWSEM_WAKE_ANY 0 /* Wake whatever's at head of wait list */ +#define RWSEM_WAKE_NO_ACTIVE 1 /* rwsem was observed with no active thread */ +#define RWSEM_WAKE_READ_OWNED 2 /* rwsem was observed to be read owned */ + /* * handle the lock release when processes blocked on it that can now run * - if we come here from up_xxxx(), then: * - the 'active part' of count (&0x0000ffff) reached 0 (but may have changed) * - the 'waiting part' of count (&0xffff0000) is -ve (and will still be so) - * - there must be someone on the queue + * - there must be someone on the queue * - the spinlock must be held by the caller * - woken process blocks are discarded from the list after having task zeroed * - writers are only woken if downgrading is false */ -static inline struct rw_semaphore * -__rwsem_do_wake(struct rw_semaphore *sem, int downgrading) +static struct rw_semaphore * +__rwsem_do_wake(struct rw_semaphore *sem, int wake_type) { struct rwsem_waiter *waiter; struct task_struct *tsk; struct list_head *next; - signed long oldcount, woken, loop; - - if (downgrading) - goto dont_wake_writers; - - /* if we came through an up_xxxx() call, we only only wake someone up - * if we can transition the active part of the count from 0 -> 1 - */ - try_again: - oldcount = rwsem_atomic_update(RWSEM_ACTIVE_BIAS, sem) - - RWSEM_ACTIVE_BIAS; - if (oldcount & RWSEM_ACTIVE_MASK) - goto undo; + signed long oldcount, woken, loop, adjustment; waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); - - /* try to grant a single write lock if there's a writer at the front - * of the queue - note we leave the 'active part' of the count - * incremented by 1 and the waiting part incremented by 0x00010000 - */ if (!(waiter->flags & RWSEM_WAITING_FOR_WRITE)) goto readers_only; + if (wake_type == RWSEM_WAKE_READ_OWNED) + /* Another active reader was observed, so wakeup is not + * likely to succeed. Save the atomic op. + */ + goto out; + + /* There's a writer at the front of the queue - try to grant it the + * write lock. However, we only wake this writer if we can transition + * the active part of the count from 0 -> 1 + */ + adjustment = RWSEM_ACTIVE_WRITE_BIAS; + if (waiter->list.next == &sem->wait_list) + adjustment -= RWSEM_WAITING_BIAS; + + try_again_write: + oldcount = rwsem_atomic_update(adjustment, sem) - adjustment; + if (oldcount & RWSEM_ACTIVE_MASK) + /* Someone grabbed the sem already */ + goto undo_write; + /* We must be careful not to touch 'waiter' after we set ->task = NULL. * It is an allocated on the waiter's stack and may become invalid at * any time after that point (due to a wakeup from another source). @@ -87,18 +98,30 @@ __rwsem_do_wake(struct rw_semaphore *sem, int downgrading) put_task_struct(tsk); goto out; - /* don't want to wake any writers */ - dont_wake_writers: - waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); - if (waiter->flags & RWSEM_WAITING_FOR_WRITE) + readers_only: + /* If we come here from up_xxxx(), another thread might have reached + * rwsem_down_failed_common() before we acquired the spinlock and + * woken up a waiter, making it now active. We prefer to check for + * this first in order to not spend too much time with the spinlock + * held if we're not going to be able to wake up readers in the end. + * + * Note that we do not need to update the rwsem count: any writer + * trying to acquire rwsem will run rwsem_down_write_failed() due + * to the waiting threads and block trying to acquire the spinlock. 
+ * + * We use a dummy atomic update in order to acquire the cache line + * exclusively since we expect to succeed and run the final rwsem + * count adjustment pretty soon. + */ + if (wake_type == RWSEM_WAKE_ANY && + rwsem_atomic_update(0, sem) < RWSEM_WAITING_BIAS) + /* Someone grabbed the sem for write already */ goto out; - /* grant an infinite number of read locks to the readers at the front - * of the queue - * - note we increment the 'active part' of the count by the number of - * readers before waking any processes up + /* Grant an infinite number of read locks to the readers at the front + * of the queue. Note we increment the 'active part' of the count by + * the number of readers before waking any processes up. */ - readers_only: woken = 0; do { woken++; @@ -111,16 +134,15 @@ __rwsem_do_wake(struct rw_semaphore *sem, int downgrading) } while (waiter->flags & RWSEM_WAITING_FOR_READ); - loop = woken; - woken *= RWSEM_ACTIVE_BIAS - RWSEM_WAITING_BIAS; - if (!downgrading) - /* we'd already done one increment earlier */ - woken -= RWSEM_ACTIVE_BIAS; + adjustment = woken * RWSEM_ACTIVE_READ_BIAS; + if (waiter->flags & RWSEM_WAITING_FOR_READ) + /* hit end of list above */ + adjustment -= RWSEM_WAITING_BIAS; - rwsem_atomic_add(woken, sem); + rwsem_atomic_add(adjustment, sem); next = sem->wait_list.next; - for (; loop > 0; loop--) { + for (loop = woken; loop > 0; loop--) { waiter = list_entry(next, struct rwsem_waiter, list); next = waiter->list.next; tsk = waiter->task; @@ -136,11 +158,12 @@ __rwsem_do_wake(struct rw_semaphore *sem, int downgrading) out: return sem; - /* undo the change to count, but check for a transition 1->0 */ - undo: - if (rwsem_atomic_update(-RWSEM_ACTIVE_BIAS, sem) != 0) + /* undo the change to the active count, but check for a transition + * 1->0 */ + undo_write: + if (rwsem_atomic_update(-adjustment, sem) & RWSEM_ACTIVE_MASK) goto out; - goto try_again; + goto try_again_write; } /* @@ -148,8 +171,9 @@ __rwsem_do_wake(struct rw_semaphore *sem, int downgrading) */ static struct rw_semaphore __sched * rwsem_down_failed_common(struct rw_semaphore *sem, - struct rwsem_waiter *waiter, signed long adjustment) + unsigned int flags, signed long adjustment) { + struct rwsem_waiter waiter; struct task_struct *tsk = current; signed long count; @@ -157,23 +181,34 @@ rwsem_down_failed_common(struct rw_semaphore *sem, /* set up my own style of waitqueue */ spin_lock_irq(&sem->wait_lock); - waiter->task = tsk; + waiter.task = tsk; + waiter.flags = flags; get_task_struct(tsk); - list_add_tail(&waiter->list, &sem->wait_list); + if (list_empty(&sem->wait_list)) + adjustment += RWSEM_WAITING_BIAS; + list_add_tail(&waiter.list, &sem->wait_list); - /* we're now waiting on the lock, but no longer actively read-locking */ + /* we're now waiting on the lock, but no longer actively locking */ count = rwsem_atomic_update(adjustment, sem); - /* if there are no active locks, wake the front queued process(es) up */ - if (!(count & RWSEM_ACTIVE_MASK)) - sem = __rwsem_do_wake(sem, 0); + /* If there are no active locks, wake the front queued process(es) up. + * + * Alternatively, if we're called from a failed down_write(), there + * were already threads queued before us and there are no active + * writers, the lock must be read owned; so we try to wake any read + * locks that were queued ahead of us. 
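[Editorial note, for reference while reading the wake paths above and below] __rwsem_do_wake() reasons entirely in terms of the packed rwsem count. The sketch below shows the common 32-bit encoding; the exact numeric values come from the arch headers and are an assumption here, only the READ/WRITE bias names are visible in this diff. Note that with this rework RWSEM_WAITING_BIAS is applied once per non-empty wait list, not once per waiter.

    /* common 32-bit rwsem count layout (values assumed from arch headers) */
    #define RWSEM_ACTIVE_BIAS        0x00000001L
    #define RWSEM_ACTIVE_MASK        0x0000ffffL
    #define RWSEM_WAITING_BIAS       (-0x00010000L)
    #define RWSEM_ACTIVE_READ_BIAS   RWSEM_ACTIVE_BIAS
    #define RWSEM_ACTIVE_WRITE_BIAS  (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)

    /*
     * example values of sem->count under this encoding:
     *   0x00000000               free
     *   0x00000003               three readers hold it, wait list empty
     *   RWSEM_WAITING_BIAS       (0xffff0000) nobody holds it but the wait
     *                            list is non-empty; this is the
     *                            RWSEM_WAKE_NO_ACTIVE case in
     *                            rwsem_down_failed_common()
     *   RWSEM_ACTIVE_WRITE_BIAS  (0xffff0001) a writer holds it: the active
     *                            part is 1 and the waiting part is set
     */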
*/ + if (count == RWSEM_WAITING_BIAS) + sem = __rwsem_do_wake(sem, RWSEM_WAKE_NO_ACTIVE); + else if (count > RWSEM_WAITING_BIAS && + adjustment == -RWSEM_ACTIVE_WRITE_BIAS) + sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED); spin_unlock_irq(&sem->wait_lock); /* wait to be given the lock */ for (;;) { - if (!waiter->task) + if (!waiter.task) break; schedule(); set_task_state(tsk, TASK_UNINTERRUPTIBLE); @@ -190,12 +225,8 @@ rwsem_down_failed_common(struct rw_semaphore *sem, asmregparm struct rw_semaphore __sched * rwsem_down_read_failed(struct rw_semaphore *sem) { - struct rwsem_waiter waiter; - - waiter.flags = RWSEM_WAITING_FOR_READ; - rwsem_down_failed_common(sem, &waiter, - RWSEM_WAITING_BIAS - RWSEM_ACTIVE_BIAS); - return sem; + return rwsem_down_failed_common(sem, RWSEM_WAITING_FOR_READ, + -RWSEM_ACTIVE_READ_BIAS); } /* @@ -204,12 +235,8 @@ rwsem_down_read_failed(struct rw_semaphore *sem) asmregparm struct rw_semaphore __sched * rwsem_down_write_failed(struct rw_semaphore *sem) { - struct rwsem_waiter waiter; - - waiter.flags = RWSEM_WAITING_FOR_WRITE; - rwsem_down_failed_common(sem, &waiter, -RWSEM_ACTIVE_BIAS); - - return sem; + return rwsem_down_failed_common(sem, RWSEM_WAITING_FOR_WRITE, + -RWSEM_ACTIVE_WRITE_BIAS); } /* @@ -224,7 +251,7 @@ asmregparm struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) /* do nothing if list empty */ if (!list_empty(&sem->wait_list)) - sem = __rwsem_do_wake(sem, 0); + sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY); spin_unlock_irqrestore(&sem->wait_lock, flags); @@ -244,7 +271,7 @@ asmregparm struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) /* do nothing if list empty */ if (!list_empty(&sem->wait_list)) - sem = __rwsem_do_wake(sem, 1); + sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED); spin_unlock_irqrestore(&sem->wait_lock, flags); diff --git a/lib/scatterlist.c b/lib/scatterlist.c index 0d475d8167bf..4ceb05d772ae 100644 --- a/lib/scatterlist.c +++ b/lib/scatterlist.c @@ -7,8 +7,10 @@ * Version 2. See the file COPYING for more details. */ #include <linux/module.h> +#include <linux/slab.h> #include <linux/scatterlist.h> #include <linux/highmem.h> +#include <linux/kmemleak.h> /** * sg_next - return the next scatterlist entry in a list @@ -114,17 +116,29 @@ EXPORT_SYMBOL(sg_init_one); */ static struct scatterlist *sg_kmalloc(unsigned int nents, gfp_t gfp_mask) { - if (nents == SG_MAX_SINGLE_ALLOC) - return (struct scatterlist *) __get_free_page(gfp_mask); - else + if (nents == SG_MAX_SINGLE_ALLOC) { + /* + * Kmemleak doesn't track page allocations as they are not + * commonly used (in a raw form) for kernel data structures. + * As we chain together a list of pages and then a normal + * kmalloc (tracked by kmemleak), in order to for that last + * allocation not to become decoupled (and thus a + * false-positive) we need to inform kmemleak of all the + * intermediate allocations. 
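[Editorial note on the scatterlist.c comment above] The rule it states is general: kmemleak does not track raw page allocations, so an object that is only reachable through such a page would otherwise be reported as a leak. The sg_kmalloc()/sg_kfree() bodies that apply this follow in the hunk below; here is the same pattern restated in isolation as a hedged sketch.

    #include <linux/gfp.h>
    #include <linux/kmemleak.h>

    /* allocate a page that kmemleak will scan and account for */
    static void *alloc_tracked_page(gfp_t gfp_mask)
    {
            void *page = (void *)__get_free_page(gfp_mask);

            if (page)
                    /* min_count = 1: warn if it becomes unreferenced */
                    kmemleak_alloc(page, PAGE_SIZE, 1, gfp_mask);
            return page;
    }

    static void free_tracked_page(void *page)
    {
            kmemleak_free(page);
            free_page((unsigned long)page);
    }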
+ */ + void *ptr = (void *) __get_free_page(gfp_mask); + kmemleak_alloc(ptr, PAGE_SIZE, 1, gfp_mask); + return ptr; + } else return kmalloc(nents * sizeof(struct scatterlist), gfp_mask); } static void sg_kfree(struct scatterlist *sg, unsigned int nents) { - if (nents == SG_MAX_SINGLE_ALLOC) + if (nents == SG_MAX_SINGLE_ALLOC) { + kmemleak_free(sg); free_page((unsigned long) sg); - else + } else kfree(sg); } @@ -234,8 +248,18 @@ int __sg_alloc_table(struct sg_table *table, unsigned int nents, left -= sg_size; sg = alloc_fn(alloc_size, gfp_mask); - if (unlikely(!sg)) - return -ENOMEM; + if (unlikely(!sg)) { + /* + * Adjust entry count to reflect that the last + * entry of the previous table won't be used for + * linkage. Without this, sg_kfree() may get + * confused. + */ + if (prv) + table->nents = ++table->orig_nents; + + return -ENOMEM; + } sg_init_table(sg, alloc_size); table->nents = table->orig_nents += sg_size; diff --git a/lib/show_mem.c b/lib/show_mem.c index 238e72a18ce1..fdc77c82f922 100644 --- a/lib/show_mem.c +++ b/lib/show_mem.c @@ -15,7 +15,7 @@ void show_mem(void) unsigned long total = 0, reserved = 0, shared = 0, nonshared = 0, highmem = 0; - printk(KERN_INFO "Mem-Info:\n"); + printk("Mem-Info:\n"); show_free_areas(); for_each_online_pgdat(pgdat) { @@ -49,15 +49,15 @@ void show_mem(void) pgdat_resize_unlock(pgdat, &flags); } - printk(KERN_INFO "%lu pages RAM\n", total); + printk("%lu pages RAM\n", total); #ifdef CONFIG_HIGHMEM - printk(KERN_INFO "%lu pages HighMem\n", highmem); + printk("%lu pages HighMem\n", highmem); #endif - printk(KERN_INFO "%lu pages reserved\n", reserved); - printk(KERN_INFO "%lu pages shared\n", shared); - printk(KERN_INFO "%lu pages non-shared\n", nonshared); + printk("%lu pages reserved\n", reserved); + printk("%lu pages shared\n", shared); + printk("%lu pages non-shared\n", nonshared); #ifdef CONFIG_QUICKLIST - printk(KERN_INFO "%lu pages in pagetable cache\n", + printk("%lu pages in pagetable cache\n", quicklist_total_size()); #endif } diff --git a/lib/string.c b/lib/string.c index a1cdcfcc42d0..f71bead1be3e 100644 --- a/lib/string.c +++ b/lib/string.c @@ -36,25 +36,21 @@ int strnicmp(const char *s1, const char *s2, size_t len) /* Yes, Virginia, it had better be unsigned */ unsigned char c1, c2; - c1 = c2 = 0; - if (len) { - do { - c1 = *s1; - c2 = *s2; - s1++; - s2++; - if (!c1) - break; - if (!c2) - break; - if (c1 == c2) - continue; - c1 = tolower(c1); - c2 = tolower(c2); - if (c1 != c2) - break; - } while (--len); - } + if (!len) + return 0; + + do { + c1 = *s1++; + c2 = *s2++; + if (!c1 || !c2) + break; + if (c1 == c2) + continue; + c1 = tolower(c1); + c2 = tolower(c2); + if (c1 != c2) + break; + } while (--len); return (int)c1 - (int)c2; } EXPORT_SYMBOL(strnicmp); @@ -693,13 +689,13 @@ EXPORT_SYMBOL(strstr); */ char *strnstr(const char *s1, const char *s2, size_t len) { - size_t l1 = len, l2; + size_t l2; l2 = strlen(s2); if (!l2) return (char *)s1; - while (l1 >= l2) { - l1--; + while (len >= l2) { + len--; if (!memcmp(s1, s2, l2)) return (char *)s1; s1++; diff --git a/lib/swiotlb.c b/lib/swiotlb.c index 437eedb5a53b..c47bbe11b804 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -28,6 +28,7 @@ #include <linux/types.h> #include <linux/ctype.h> #include <linux/highmem.h> +#include <linux/gfp.h> #include <asm/io.h> #include <asm/dma.h> @@ -49,25 +50,17 @@ */ #define IO_TLB_MIN_SLABS ((1<<20) >> IO_TLB_SHIFT) -/* - * Enumeration for sync targets - */ -enum dma_sync_target { - SYNC_FOR_CPU = 0, - SYNC_FOR_DEVICE = 1, -}; - int swiotlb_force; /* 
- * Used to do a quick range check in unmap_single and - * sync_single_*, to see if the memory was in fact allocated by this + * Used to do a quick range check in swiotlb_tbl_unmap_single and + * swiotlb_tbl_sync_single_*, to see if the memory was in fact allocated by this * API. */ static char *io_tlb_start, *io_tlb_end; /* - * The number of IO TLB blocks (in groups of 64) betweeen io_tlb_start and + * The number of IO TLB blocks (in groups of 64) between io_tlb_start and * io_tlb_end. This is command line adjustable via setup_io_tlb_npages. */ static unsigned long io_tlb_nslabs; @@ -77,7 +70,7 @@ static unsigned long io_tlb_nslabs; */ static unsigned long io_tlb_overflow = 32*1024; -void *io_tlb_overflow_buffer; +static void *io_tlb_overflow_buffer; /* * This is a free list describing the number of free entries available from @@ -139,28 +132,14 @@ void swiotlb_print_info(void) (unsigned long long)pend); } -/* - * Statically reserve bounce buffer space and initialize bounce buffer data - * structures for the software IO TLB used to implement the DMA API. - */ -void __init -swiotlb_init_with_default_size(size_t default_size, int verbose) +void __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) { unsigned long i, bytes; - if (!io_tlb_nslabs) { - io_tlb_nslabs = (default_size >> IO_TLB_SHIFT); - io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE); - } - - bytes = io_tlb_nslabs << IO_TLB_SHIFT; + bytes = nslabs << IO_TLB_SHIFT; - /* - * Get IO TLB memory from the low pages - */ - io_tlb_start = alloc_bootmem_low_pages(bytes); - if (!io_tlb_start) - panic("Cannot allocate SWIOTLB buffer"); + io_tlb_nslabs = nslabs; + io_tlb_start = tlb; io_tlb_end = io_tlb_start + bytes; /* @@ -168,22 +147,48 @@ swiotlb_init_with_default_size(size_t default_size, int verbose) * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE * between io_tlb_start and io_tlb_end. */ - io_tlb_list = alloc_bootmem(io_tlb_nslabs * sizeof(int)); + io_tlb_list = alloc_bootmem_pages(PAGE_ALIGN(io_tlb_nslabs * sizeof(int))); for (i = 0; i < io_tlb_nslabs; i++) io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE); io_tlb_index = 0; - io_tlb_orig_addr = alloc_bootmem(io_tlb_nslabs * sizeof(phys_addr_t)); + io_tlb_orig_addr = alloc_bootmem_pages(PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t))); /* * Get the overflow emergency buffer */ - io_tlb_overflow_buffer = alloc_bootmem_low(io_tlb_overflow); + io_tlb_overflow_buffer = alloc_bootmem_low_pages(PAGE_ALIGN(io_tlb_overflow)); if (!io_tlb_overflow_buffer) panic("Cannot allocate SWIOTLB overflow buffer!\n"); if (verbose) swiotlb_print_info(); } +/* + * Statically reserve bounce buffer space and initialize bounce buffer data + * structures for the software IO TLB used to implement the DMA API. 
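[Editorial note on swiotlb_init_with_tbl() above] Factoring the table setup out of swiotlb_init_with_default_size() (which follows below) lets code that must place the bounce buffer itself allocate the memory and then hand it over. A hedged sketch of such a caller; the 64 MB size and the function name are invented, and IO_TLB_SHIFT/IO_TLB_SEGSIZE are assumed to be available from the swiotlb header.

    /* hypothetical early-boot caller that supplies its own IO TLB buffer */
    static void __init my_platform_swiotlb_init(void)
    {
            unsigned long nslabs = ALIGN((64UL << 20) >> IO_TLB_SHIFT,
                                         IO_TLB_SEGSIZE);   /* ~64 MB */
            size_t bytes = nslabs << IO_TLB_SHIFT;
            char *tlb = alloc_bootmem_low_pages(PAGE_ALIGN(bytes));

            if (!tlb)
                    panic("my_platform: cannot allocate SWIOTLB buffer");

            swiotlb_init_with_tbl(tlb, nslabs, 1 /* verbose */);
    }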
+ */ +void __init +swiotlb_init_with_default_size(size_t default_size, int verbose) +{ + unsigned long bytes; + + if (!io_tlb_nslabs) { + io_tlb_nslabs = (default_size >> IO_TLB_SHIFT); + io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE); + } + + bytes = io_tlb_nslabs << IO_TLB_SHIFT; + + /* + * Get IO TLB memory from the low pages + */ + io_tlb_start = alloc_bootmem_low_pages(PAGE_ALIGN(bytes)); + if (!io_tlb_start) + panic("Cannot allocate SWIOTLB buffer"); + + swiotlb_init_with_tbl(io_tlb_start, io_tlb_nslabs, verbose); +} + void __init swiotlb_init(int verbose) { @@ -303,13 +308,13 @@ void __init swiotlb_free(void) get_order(io_tlb_nslabs << IO_TLB_SHIFT)); } else { free_bootmem_late(__pa(io_tlb_overflow_buffer), - io_tlb_overflow); + PAGE_ALIGN(io_tlb_overflow)); free_bootmem_late(__pa(io_tlb_orig_addr), - io_tlb_nslabs * sizeof(phys_addr_t)); + PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t))); free_bootmem_late(__pa(io_tlb_list), - io_tlb_nslabs * sizeof(int)); + PAGE_ALIGN(io_tlb_nslabs * sizeof(int))); free_bootmem_late(__pa(io_tlb_start), - io_tlb_nslabs << IO_TLB_SHIFT); + PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); } } @@ -322,8 +327,8 @@ static int is_swiotlb_buffer(phys_addr_t paddr) /* * Bounce: copy the swiotlb buffer back to the original dma location */ -static void swiotlb_bounce(phys_addr_t phys, char *dma_addr, size_t size, - enum dma_data_direction dir) +void swiotlb_bounce(phys_addr_t phys, char *dma_addr, size_t size, + enum dma_data_direction dir) { unsigned long pfn = PFN_DOWN(phys); @@ -359,26 +364,25 @@ static void swiotlb_bounce(phys_addr_t phys, char *dma_addr, size_t size, memcpy(phys_to_virt(phys), dma_addr, size); } } +EXPORT_SYMBOL_GPL(swiotlb_bounce); -/* - * Allocates bounce buffer and returns its kernel virtual address. - */ -static void * -map_single(struct device *hwdev, phys_addr_t phys, size_t size, int dir) +void *swiotlb_tbl_map_single(struct device *hwdev, dma_addr_t tbl_dma_addr, + phys_addr_t phys, size_t size, + enum dma_data_direction dir) { unsigned long flags; char *dma_addr; unsigned int nslots, stride, index, wrap; int i; - unsigned long start_dma_addr; unsigned long mask; unsigned long offset_slots; unsigned long max_slots; mask = dma_get_seg_boundary(hwdev); - start_dma_addr = swiotlb_virt_to_bus(hwdev, io_tlb_start) & mask; - offset_slots = ALIGN(start_dma_addr, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; + tbl_dma_addr &= mask; + + offset_slots = ALIGN(tbl_dma_addr, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; /* * Carefully handle integer overflow which can occur when mask == ~0UL. @@ -465,12 +469,27 @@ found: return dma_addr; } +EXPORT_SYMBOL_GPL(swiotlb_tbl_map_single); + +/* + * Allocates bounce buffer and returns its kernel virtual address. + */ + +static void * +map_single(struct device *hwdev, phys_addr_t phys, size_t size, + enum dma_data_direction dir) +{ + dma_addr_t start_dma_addr = swiotlb_virt_to_bus(hwdev, io_tlb_start); + + return swiotlb_tbl_map_single(hwdev, start_dma_addr, phys, size, dir); +} /* * dma_addr is the kernel virtual address of the bounce buffer to unmap. 
*/ -static void -do_unmap_single(struct device *hwdev, char *dma_addr, size_t size, int dir) +void +swiotlb_tbl_unmap_single(struct device *hwdev, char *dma_addr, size_t size, + enum dma_data_direction dir) { unsigned long flags; int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; @@ -508,10 +527,12 @@ do_unmap_single(struct device *hwdev, char *dma_addr, size_t size, int dir) } spin_unlock_irqrestore(&io_tlb_lock, flags); } +EXPORT_SYMBOL_GPL(swiotlb_tbl_unmap_single); -static void -sync_single(struct device *hwdev, char *dma_addr, size_t size, - int dir, int target) +void +swiotlb_tbl_sync_single(struct device *hwdev, char *dma_addr, size_t size, + enum dma_data_direction dir, + enum dma_sync_target target) { int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT; phys_addr_t phys = io_tlb_orig_addr[index]; @@ -535,6 +556,7 @@ sync_single(struct device *hwdev, char *dma_addr, size_t size, BUG(); } } +EXPORT_SYMBOL_GPL(swiotlb_tbl_sync_single); void * swiotlb_alloc_coherent(struct device *hwdev, size_t size, @@ -558,8 +580,8 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size, } if (!ret) { /* - * We are either out of memory or the device can't DMA - * to GFP_DMA memory; fall back on map_single(), which + * We are either out of memory or the device can't DMA to + * GFP_DMA memory; fall back on map_single(), which * will grab memory from the lowest available address range. */ ret = map_single(hwdev, 0, size, DMA_FROM_DEVICE); @@ -577,7 +599,7 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size, (unsigned long long)dev_addr); /* DMA_TO_DEVICE to avoid memcpy in unmap_single */ - do_unmap_single(hwdev, ret, size, DMA_TO_DEVICE); + swiotlb_tbl_unmap_single(hwdev, ret, size, DMA_TO_DEVICE); return NULL; } *dma_handle = dev_addr; @@ -595,13 +617,14 @@ swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr, if (!is_swiotlb_buffer(paddr)) free_pages((unsigned long)vaddr, get_order(size)); else - /* DMA_TO_DEVICE to avoid memcpy in unmap_single */ - do_unmap_single(hwdev, vaddr, size, DMA_TO_DEVICE); + /* DMA_TO_DEVICE to avoid memcpy in swiotlb_tbl_unmap_single */ + swiotlb_tbl_unmap_single(hwdev, vaddr, size, DMA_TO_DEVICE); } EXPORT_SYMBOL(swiotlb_free_coherent); static void -swiotlb_full(struct device *dev, size_t size, int dir, int do_panic) +swiotlb_full(struct device *dev, size_t size, enum dma_data_direction dir, + int do_panic) { /* * Ran out of IOMMU space for this operation. This is very bad. @@ -679,14 +702,14 @@ EXPORT_SYMBOL_GPL(swiotlb_map_page); * whatever the device wrote there. 
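[Editorial note on the swiotlb_tbl_* renames above] With map/unmap/sync exported under their new names, an outside user of the bounce pool drives the whole life cycle itself. A hedged sketch of a device-to-memory transfer using the signatures visible in this diff; hwdev, tbl_dma_addr (the bus address of the caller's bounce pool) and the error handling are schematic.

    static void my_bounce_from_device(struct device *hwdev,
                                      dma_addr_t tbl_dma_addr,
                                      phys_addr_t phys, size_t size)
    {
            char *bounce;

            bounce = swiotlb_tbl_map_single(hwdev, tbl_dma_addr, phys, size,
                                            DMA_FROM_DEVICE);
            if (!bounce)
                    return;         /* no slots left: fall back or fail */

            /* ... program the device to DMA into the bounce slot ... */

            /* only needed if the CPU inspects the data while the mapping is
             * still live; the final copy-back for DMA_FROM_DEVICE also
             * happens inside unmap below */
            swiotlb_tbl_sync_single(hwdev, bounce, size, DMA_FROM_DEVICE,
                                    SYNC_FOR_CPU);

            swiotlb_tbl_unmap_single(hwdev, bounce, size, DMA_FROM_DEVICE);
    }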
*/ static void unmap_single(struct device *hwdev, dma_addr_t dev_addr, - size_t size, int dir) + size_t size, enum dma_data_direction dir) { phys_addr_t paddr = dma_to_phys(hwdev, dev_addr); BUG_ON(dir == DMA_NONE); if (is_swiotlb_buffer(paddr)) { - do_unmap_single(hwdev, phys_to_virt(paddr), size, dir); + swiotlb_tbl_unmap_single(hwdev, phys_to_virt(paddr), size, dir); return; } @@ -722,14 +745,16 @@ EXPORT_SYMBOL_GPL(swiotlb_unmap_page); */ static void swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr, - size_t size, int dir, int target) + size_t size, enum dma_data_direction dir, + enum dma_sync_target target) { phys_addr_t paddr = dma_to_phys(hwdev, dev_addr); BUG_ON(dir == DMA_NONE); if (is_swiotlb_buffer(paddr)) { - sync_single(hwdev, phys_to_virt(paddr), size, dir, target); + swiotlb_tbl_sync_single(hwdev, phys_to_virt(paddr), size, dir, + target); return; } @@ -756,37 +781,6 @@ swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr, EXPORT_SYMBOL(swiotlb_sync_single_for_device); /* - * Same as above, but for a sub-range of the mapping. - */ -static void -swiotlb_sync_single_range(struct device *hwdev, dma_addr_t dev_addr, - unsigned long offset, size_t size, - int dir, int target) -{ - swiotlb_sync_single(hwdev, dev_addr + offset, size, dir, target); -} - -void -swiotlb_sync_single_range_for_cpu(struct device *hwdev, dma_addr_t dev_addr, - unsigned long offset, size_t size, - enum dma_data_direction dir) -{ - swiotlb_sync_single_range(hwdev, dev_addr, offset, size, dir, - SYNC_FOR_CPU); -} -EXPORT_SYMBOL_GPL(swiotlb_sync_single_range_for_cpu); - -void -swiotlb_sync_single_range_for_device(struct device *hwdev, dma_addr_t dev_addr, - unsigned long offset, size_t size, - enum dma_data_direction dir) -{ - swiotlb_sync_single_range(hwdev, dev_addr, offset, size, dir, - SYNC_FOR_DEVICE); -} -EXPORT_SYMBOL_GPL(swiotlb_sync_single_range_for_device); - -/* * Map a set of buffers described by scatterlist in streaming mode for DMA. * This is the scatter-gather version of the above swiotlb_map_page * interface. Here the scatter gather list elements are each tagged with the @@ -839,7 +833,7 @@ EXPORT_SYMBOL(swiotlb_map_sg_attrs); int swiotlb_map_sg(struct device *hwdev, struct scatterlist *sgl, int nelems, - int dir) + enum dma_data_direction dir) { return swiotlb_map_sg_attrs(hwdev, sgl, nelems, dir, NULL); } @@ -866,7 +860,7 @@ EXPORT_SYMBOL(swiotlb_unmap_sg_attrs); void swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nelems, - int dir) + enum dma_data_direction dir) { return swiotlb_unmap_sg_attrs(hwdev, sgl, nelems, dir, NULL); } @@ -881,7 +875,8 @@ EXPORT_SYMBOL(swiotlb_unmap_sg); */ static void swiotlb_sync_sg(struct device *hwdev, struct scatterlist *sgl, - int nelems, int dir, int target) + int nelems, enum dma_data_direction dir, + enum dma_sync_target target) { struct scatterlist *sg; int i; diff --git a/lib/textsearch.c b/lib/textsearch.c index 9fbcb44c554f..d608331b3e47 100644 --- a/lib/textsearch.c +++ b/lib/textsearch.c @@ -103,6 +103,7 @@ #include <linux/rcupdate.h> #include <linux/err.h> #include <linux/textsearch.h> +#include <linux/slab.h> static LIST_HEAD(ts_ops); static DEFINE_SPINLOCK(ts_mod_lock); diff --git a/lib/timerqueue.c b/lib/timerqueue.c new file mode 100644 index 000000000000..e3a1050e6820 --- /dev/null +++ b/lib/timerqueue.c @@ -0,0 +1,107 @@ +/* + * Generic Timer-queue + * + * Manages a simple queue of timers, ordered by expiration time. + * Uses rbtrees for quick list adds and expiration. 
+ * + * NOTE: All of the following functions need to be serialized + * to avoid races. No locking is done by this libary code. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/timerqueue.h> +#include <linux/rbtree.h> +#include <linux/module.h> + +/** + * timerqueue_add - Adds timer to timerqueue. + * + * @head: head of timerqueue + * @node: timer node to be added + * + * Adds the timer node to the timerqueue, sorted by the + * node's expires value. + */ +void timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node) +{ + struct rb_node **p = &head->head.rb_node; + struct rb_node *parent = NULL; + struct timerqueue_node *ptr; + + /* Make sure we don't add nodes that are already added */ + WARN_ON_ONCE(!RB_EMPTY_NODE(&node->node)); + + while (*p) { + parent = *p; + ptr = rb_entry(parent, struct timerqueue_node, node); + if (node->expires.tv64 < ptr->expires.tv64) + p = &(*p)->rb_left; + else + p = &(*p)->rb_right; + } + rb_link_node(&node->node, parent, p); + rb_insert_color(&node->node, &head->head); + + if (!head->next || node->expires.tv64 < head->next->expires.tv64) + head->next = node; +} +EXPORT_SYMBOL_GPL(timerqueue_add); + +/** + * timerqueue_del - Removes a timer from the timerqueue. + * + * @head: head of timerqueue + * @node: timer node to be removed + * + * Removes the timer node from the timerqueue. + */ +void timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node) +{ + WARN_ON_ONCE(RB_EMPTY_NODE(&node->node)); + + /* update next pointer */ + if (head->next == node) { + struct rb_node *rbn = rb_next(&node->node); + + head->next = rbn ? + rb_entry(rbn, struct timerqueue_node, node) : NULL; + } + rb_erase(&node->node, &head->head); + RB_CLEAR_NODE(&node->node); +} +EXPORT_SYMBOL_GPL(timerqueue_del); + +/** + * timerqueue_iterate_next - Returns the timer after the provided timer + * + * @node: Pointer to a timer. + * + * Provides the timer that is after the given node. This is used, when + * necessary, to iterate through the list of timers in a timer list + * without modifying the list. + */ +struct timerqueue_node *timerqueue_iterate_next(struct timerqueue_node *node) +{ + struct rb_node *next; + + if (!node) + return NULL; + next = rb_next(&node->node); + if (!next) + return NULL; + return container_of(next, struct timerqueue_node, node); +} +EXPORT_SYMBOL_GPL(timerqueue_iterate_next); diff --git a/lib/uuid.c b/lib/uuid.c new file mode 100644 index 000000000000..8fadd7cef46c --- /dev/null +++ b/lib/uuid.c @@ -0,0 +1,53 @@ +/* + * Unified UUID/GUID definition + * + * Copyright (C) 2009, Intel Corp. 
+ * Huang Ying <ying.huang@intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation; + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/uuid.h> +#include <linux/random.h> + +static void __uuid_gen_common(__u8 b[16]) +{ + int i; + u32 r; + + for (i = 0; i < 4; i++) { + r = random32(); + memcpy(b + i * 4, &r, 4); + } + /* reversion 0b10 */ + b[8] = (b[8] & 0x3F) | 0x80; +} + +void uuid_le_gen(uuid_le *lu) +{ + __uuid_gen_common(lu->b); + /* version 4 : random generation */ + lu->b[7] = (lu->b[7] & 0x0F) | 0x40; +} +EXPORT_SYMBOL_GPL(uuid_le_gen); + +void uuid_be_gen(uuid_be *bu) +{ + __uuid_gen_common(bu->b); + /* version 4 : random generation */ + bu->b[6] = (bu->b[6] & 0x0F) | 0x40; +} +EXPORT_SYMBOL_GPL(uuid_be_gen); diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 3b8aeec4e327..d3023df8477f 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -118,6 +118,7 @@ long long simple_strtoll(const char *cp, char **endp, unsigned int base) return simple_strtoull(cp, endp, base); } +EXPORT_SYMBOL(simple_strtoll); /** * strict_strtoul - convert a string to an unsigned long strictly @@ -145,19 +146,16 @@ int strict_strtoul(const char *cp, unsigned int base, unsigned long *res) { char *tail; unsigned long val; - size_t len; *res = 0; - len = strlen(cp); - if (len == 0) + if (!*cp) return -EINVAL; val = simple_strtoul(cp, &tail, base); if (tail == cp) return -EINVAL; - if ((*tail == '\0') || - ((len == (size_t)(tail - cp) + 1) && (*tail == '\n'))) { + if ((tail[0] == '\0') || (tail[0] == '\n' && tail[1] == '\0')) { *res = val; return 0; } @@ -219,18 +217,15 @@ int strict_strtoull(const char *cp, unsigned int base, unsigned long long *res) { char *tail; unsigned long long val; - size_t len; *res = 0; - len = strlen(cp); - if (len == 0) + if (!*cp) return -EINVAL; val = simple_strtoull(cp, &tail, base); if (tail == cp) return -EINVAL; - if ((*tail == '\0') || - ((len == (size_t)(tail - cp) + 1) && (*tail == '\n'))) { + if ((tail[0] == '\0') || (tail[0] == '\n' && tail[1] == '\0')) { *res = val; return 0; } @@ -266,7 +261,8 @@ int strict_strtoll(const char *cp, unsigned int base, long long *res) } EXPORT_SYMBOL(strict_strtoll); -static int skip_atoi(const char **s) +static noinline_for_stack +int skip_atoi(const char **s) { int i = 0; @@ -286,7 +282,8 @@ static int skip_atoi(const char **s) /* Formats correctly any integer in [0,99999]. * Outputs from one to five digits depending on input. * On i386 gcc 4.1.2 -O2: ~250 bytes of code. */ -static char *put_dec_trunc(char *buf, unsigned q) +static noinline_for_stack +char *put_dec_trunc(char *buf, unsigned q) { unsigned d3, d2, d1, d0; d1 = (q>>4) & 0xf; @@ -323,7 +320,8 @@ static char *put_dec_trunc(char *buf, unsigned q) return buf; } /* Same with if's removed. 
Always emits five digits */ -static char *put_dec_full(char *buf, unsigned q) +static noinline_for_stack +char *put_dec_full(char *buf, unsigned q) { /* BTW, if q is in [0,9999], 8-bit ints will be enough, */ /* but anyway, gcc produces better code with full-sized ints */ @@ -365,7 +363,8 @@ static char *put_dec_full(char *buf, unsigned q) return buf; } /* No inlining helps gcc to use registers better */ -static noinline char *put_dec(char *buf, unsigned long long num) +static noinline_for_stack +char *put_dec(char *buf, unsigned long long num) { while (1) { unsigned rem; @@ -381,8 +380,8 @@ static noinline char *put_dec(char *buf, unsigned long long num) #define PLUS 4 /* show plus */ #define SPACE 8 /* space if plus */ #define LEFT 16 /* left justified */ -#define SMALL 32 /* Must be 32 == 0x20 */ -#define SPECIAL 64 /* 0x */ +#define SMALL 32 /* use lowercase in hex (must be 32 == 0x20) */ +#define SPECIAL 64 /* prefix hex with "0x", octal with "0" */ enum format_type { FORMAT_TYPE_NONE, /* Just a string part */ @@ -408,16 +407,17 @@ enum format_type { }; struct printf_spec { - enum format_type type; - int flags; /* flags to number() */ - int field_width; /* width of output field */ - int base; - int precision; /* # of digits/chars */ - int qualifier; + u8 type; /* format_type enum */ + u8 flags; /* flags to number() */ + u8 base; /* number base, 8, 10 or 16 only */ + u8 qualifier; /* number qualifier, one of 'hHlLtzZ' */ + s16 field_width; /* width of output field */ + s16 precision; /* # of digits/chars */ }; -static char *number(char *buf, char *end, unsigned long long num, - struct printf_spec spec) +static noinline_for_stack +char *number(char *buf, char *end, unsigned long long num, + struct printf_spec spec) { /* we are called with base 8, 10 or 16, only, thus don't need "G..." 
*/ static const char digits[16] = "0123456789ABCDEF"; /* "GHIJKLMNOPQRSTUVWXYZ"; */ @@ -536,7 +536,8 @@ static char *number(char *buf, char *end, unsigned long long num, return buf; } -static char *string(char *buf, char *end, const char *s, struct printf_spec spec) +static noinline_for_stack +char *string(char *buf, char *end, const char *s, struct printf_spec spec) { int len, i; @@ -566,8 +567,9 @@ static char *string(char *buf, char *end, const char *s, struct printf_spec spec return buf; } -static char *symbol_string(char *buf, char *end, void *ptr, - struct printf_spec spec, char ext) +static noinline_for_stack +char *symbol_string(char *buf, char *end, void *ptr, + struct printf_spec spec, char ext) { unsigned long value = (unsigned long) ptr; #ifdef CONFIG_KALLSYMS @@ -587,8 +589,9 @@ static char *symbol_string(char *buf, char *end, void *ptr, #endif } -static char *resource_string(char *buf, char *end, struct resource *res, - struct printf_spec spec, const char *fmt) +static noinline_for_stack +char *resource_string(char *buf, char *end, struct resource *res, + struct printf_spec spec, const char *fmt) { #ifndef IO_RSRC_PRINTK_SIZE #define IO_RSRC_PRINTK_SIZE 6 @@ -597,22 +600,35 @@ static char *resource_string(char *buf, char *end, struct resource *res, #ifndef MEM_RSRC_PRINTK_SIZE #define MEM_RSRC_PRINTK_SIZE 10 #endif - struct printf_spec hex_spec = { + static const struct printf_spec io_spec = { .base = 16, + .field_width = IO_RSRC_PRINTK_SIZE, .precision = -1, .flags = SPECIAL | SMALL | ZEROPAD, }; - struct printf_spec dec_spec = { + static const struct printf_spec mem_spec = { + .base = 16, + .field_width = MEM_RSRC_PRINTK_SIZE, + .precision = -1, + .flags = SPECIAL | SMALL | ZEROPAD, + }; + static const struct printf_spec bus_spec = { + .base = 16, + .field_width = 2, + .precision = -1, + .flags = SMALL | ZEROPAD, + }; + static const struct printf_spec dec_spec = { .base = 10, .precision = -1, .flags = 0, }; - struct printf_spec str_spec = { + static const struct printf_spec str_spec = { .field_width = -1, .precision = 10, .flags = LEFT, }; - struct printf_spec flag_spec = { + static const struct printf_spec flag_spec = { .base = 16, .precision = -1, .flags = SPECIAL | SMALL, @@ -622,47 +638,48 @@ static char *resource_string(char *buf, char *end, struct resource *res, * 64-bit res (sizeof==8): 20 chars in dec, 18 in hex ("0x" + 16) */ #define RSRC_BUF_SIZE ((2 * sizeof(resource_size_t)) + 4) #define FLAG_BUF_SIZE (2 * sizeof(res->flags)) -#define DECODED_BUF_SIZE sizeof("[mem - 64bit pref disabled]") +#define DECODED_BUF_SIZE sizeof("[mem - 64bit pref window disabled]") #define RAW_BUF_SIZE sizeof("[mem - flags 0x]") char sym[max(2*RSRC_BUF_SIZE + DECODED_BUF_SIZE, 2*RSRC_BUF_SIZE + FLAG_BUF_SIZE + RAW_BUF_SIZE)]; char *p = sym, *pend = sym + sizeof(sym); - int size = -1, addr = 0; int decode = (fmt[0] == 'R') ? 
1 : 0; - - if (res->flags & IORESOURCE_IO) { - size = IO_RSRC_PRINTK_SIZE; - addr = 1; - } else if (res->flags & IORESOURCE_MEM) { - size = MEM_RSRC_PRINTK_SIZE; - addr = 1; - } + const struct printf_spec *specp; *p++ = '['; - if (res->flags & IORESOURCE_IO) + if (res->flags & IORESOURCE_IO) { p = string(p, pend, "io ", str_spec); - else if (res->flags & IORESOURCE_MEM) + specp = &io_spec; + } else if (res->flags & IORESOURCE_MEM) { p = string(p, pend, "mem ", str_spec); - else if (res->flags & IORESOURCE_IRQ) + specp = &mem_spec; + } else if (res->flags & IORESOURCE_IRQ) { p = string(p, pend, "irq ", str_spec); - else if (res->flags & IORESOURCE_DMA) + specp = &dec_spec; + } else if (res->flags & IORESOURCE_DMA) { p = string(p, pend, "dma ", str_spec); - else { + specp = &dec_spec; + } else if (res->flags & IORESOURCE_BUS) { + p = string(p, pend, "bus ", str_spec); + specp = &bus_spec; + } else { p = string(p, pend, "??? ", str_spec); + specp = &mem_spec; decode = 0; } - hex_spec.field_width = size; - p = number(p, pend, res->start, addr ? hex_spec : dec_spec); + p = number(p, pend, res->start, *specp); if (res->start != res->end) { *p++ = '-'; - p = number(p, pend, res->end, addr ? hex_spec : dec_spec); + p = number(p, pend, res->end, *specp); } if (decode) { if (res->flags & IORESOURCE_MEM_64) p = string(p, pend, " 64bit", str_spec); if (res->flags & IORESOURCE_PREFETCH) p = string(p, pend, " pref", str_spec); + if (res->flags & IORESOURCE_WINDOW) + p = string(p, pend, " window", str_spec); if (res->flags & IORESOURCE_DISABLED) p = string(p, pend, " disabled", str_spec); } else { @@ -675,30 +692,63 @@ static char *resource_string(char *buf, char *end, struct resource *res, return string(buf, end, sym, spec); } -static char *mac_address_string(char *buf, char *end, u8 *addr, - struct printf_spec spec, const char *fmt) +static noinline_for_stack +char *mac_address_string(char *buf, char *end, u8 *addr, + struct printf_spec spec, const char *fmt) { char mac_addr[sizeof("xx:xx:xx:xx:xx:xx")]; char *p = mac_addr; int i; + char separator; + + if (fmt[1] == 'F') { /* FDDI canonical format */ + separator = '-'; + } else { + separator = ':'; + } for (i = 0; i < 6; i++) { p = pack_hex_byte(p, addr[i]); if (fmt[0] == 'M' && i != 5) - *p++ = ':'; + *p++ = separator; } *p = '\0'; return string(buf, end, mac_addr, spec); } -static char *ip4_string(char *p, const u8 *addr, bool leading_zeros) +static noinline_for_stack +char *ip4_string(char *p, const u8 *addr, const char *fmt) { int i; - + bool leading_zeros = (fmt[0] == 'i'); + int index; + int step; + + switch (fmt[2]) { + case 'h': +#ifdef __BIG_ENDIAN + index = 0; + step = 1; +#else + index = 3; + step = -1; +#endif + break; + case 'l': + index = 3; + step = -1; + break; + case 'n': + case 'b': + default: + index = 0; + step = 1; + break; + } for (i = 0; i < 4; i++) { char temp[3]; /* hold each IP quad in reverse order */ - int digits = put_dec_trunc(temp, addr[i]) - temp; + int digits = put_dec_trunc(temp, addr[index]) - temp; if (leading_zeros) { if (digits < 3) *p++ = '0'; @@ -710,13 +760,15 @@ static char *ip4_string(char *p, const u8 *addr, bool leading_zeros) *p++ = temp[digits]; if (i < 3) *p++ = '.'; + index += step; } *p = '\0'; return p; } -static char *ip6_compressed_string(char *p, const char *addr) +static noinline_for_stack +char *ip6_compressed_string(char *p, const char *addr) { int i, j, range; unsigned char zerolength[8]; @@ -789,14 +841,15 @@ static char *ip6_compressed_string(char *p, const char *addr) if (useIPv4) { if 
(needcolon) *p++ = ':'; - p = ip4_string(p, &in6.s6_addr[12], false); + p = ip4_string(p, &in6.s6_addr[12], "I4"); } *p = '\0'; return p; } -static char *ip6_string(char *p, const char *addr, const char *fmt) +static noinline_for_stack +char *ip6_string(char *p, const char *addr, const char *fmt) { int i; @@ -811,8 +864,9 @@ static char *ip6_string(char *p, const char *addr, const char *fmt) return p; } -static char *ip6_addr_string(char *buf, char *end, const u8 *addr, - struct printf_spec spec, const char *fmt) +static noinline_for_stack +char *ip6_addr_string(char *buf, char *end, const u8 *addr, + struct printf_spec spec, const char *fmt) { char ip6_addr[sizeof("xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:255.255.255.255")]; @@ -824,18 +878,20 @@ static char *ip6_addr_string(char *buf, char *end, const u8 *addr, return string(buf, end, ip6_addr, spec); } -static char *ip4_addr_string(char *buf, char *end, const u8 *addr, - struct printf_spec spec, const char *fmt) +static noinline_for_stack +char *ip4_addr_string(char *buf, char *end, const u8 *addr, + struct printf_spec spec, const char *fmt) { char ip4_addr[sizeof("255.255.255.255")]; - ip4_string(ip4_addr, addr, fmt[0] == 'i'); + ip4_string(ip4_addr, addr, fmt); return string(buf, end, ip4_addr, spec); } -static char *uuid_string(char *buf, char *end, const u8 *addr, - struct printf_spec spec, const char *fmt) +static noinline_for_stack +char *uuid_string(char *buf, char *end, const u8 *addr, + struct printf_spec spec, const char *fmt) { char uuid[sizeof("xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx")]; char *p = uuid; @@ -880,6 +936,8 @@ static char *uuid_string(char *buf, char *end, const u8 *addr, return string(buf, end, uuid, spec); } +int kptr_restrict = 1; + /* * Show a '%p' thing. A kernel extension is that the '%p' is followed * by an extra set of alphanumeric characters that are extended format @@ -896,12 +954,15 @@ static char *uuid_string(char *buf, char *end, const u8 *addr, * - 'M' For a 6-byte MAC address, it prints the address in the * usual colon-separated hex notation * - 'm' For a 6-byte MAC address, it prints the hex address without colons + * - 'MF' For a 6-byte MAC FDDI address, it prints the address + * with a dash-separated hex notation * - 'I' [46] for IPv4/IPv6 addresses printed in the usual way * IPv4 uses dot-separated decimal without leading 0's (1.2.3.4) * IPv6 uses colon separated network-order 16 bit hex with leading 0's * - 'i' [46] for 'raw' IPv4/IPv6 addresses * IPv6 omits the colons (01020304...0f) * IPv4 uses dot-separated decimal with leading 0's (010.123.045.006) + * - '[Ii]4[hnbl]' IPv4 addresses in host, network, big or little endian order * - 'I6c' for IPv6 addresses printed as specified by * http://tools.ietf.org/html/draft-ietf-6man-text-addr-representation-00 * - 'U' For a 16 byte UUID/GUID, it prints the UUID/GUID in the form @@ -915,16 +976,30 @@ static char *uuid_string(char *buf, char *end, const u8 *addr, * [0][1][2][3]-[4][5]-[6][7]-[8][9]-[10][11][12][13][14][15] * little endian output byte order is: * [3][2][1][0]-[5][4]-[7][6]-[8][9]-[10][11][12][13][14][15] + * - 'V' For a struct va_format which contains a format string * and va_list *, + * call vsnprintf(->format, *->va_list). + * Implements a "recursive vsnprintf". + * Do not use this feature without some mechanism to verify the + * correctness of the format string and va_list arguments. 
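[Editorial note on the %p documentation above and the 'K' case that follows] A few hedged printk() examples of the specifiers this diff adds or extends; the variable names, values and helper function are purely illustrative.

    #include <linux/kernel.h>
    #include <linux/device.h>
    #include <linux/ioport.h>

    static void show_formats(struct device *dev, struct resource *res)
    {
            u8 mac[6] = { 0x00, 0x16, 0x3e, 0x11, 0x22, 0x33 };
            u8 ip[4]  = { 10, 0, 0, 1 };

            printk(KERN_INFO "eth:  %pM\n",  mac);  /* 00:16:3e:11:22:33        */
            printk(KERN_INFO "fddi: %pMF\n", mac);  /* 00-16-3e-11-22-33        */
            printk(KERN_INFO "ip:   %pI4\n", ip);   /* 10.0.0.1                 */
            printk(KERN_INFO "ipz:  %pi4\n", ip);   /* 010.000.000.001          */
            /* the new [hnbl] suffix selects host/network/big/little order     */
            printk(KERN_INFO "res:  %pR\n",  res);  /* [mem 0x...-0x... pref]   */
            printk(KERN_INFO "dev:  %s\n", dev_name(dev));
    }

    /* %pV lets a helper forward a format string plus va_list in one printk() */
    static void my_vprintk(struct device *dev, const char *fmt, va_list args)
    {
            struct va_format vaf = { .fmt = fmt, .va = &args };

            printk(KERN_INFO "%s: %pV", dev_name(dev), &vaf);
    }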
+ * - 'K' For a kernel pointer that should be hidden from unprivileged users * * Note: The difference between 'S' and 'F' is that on ia64 and ppc64 * function pointers are really function descriptors, which contain a * pointer to the real address. */ -static char *pointer(const char *fmt, char *buf, char *end, void *ptr, - struct printf_spec spec) +static noinline_for_stack +char *pointer(const char *fmt, char *buf, char *end, void *ptr, + struct printf_spec spec) { - if (!ptr) + if (!ptr) { + /* + * Print (null) with the same width as a pointer so it makes + * tabular output look nice. + */ + if (spec.field_width == -1) + spec.field_width = 2 * sizeof(void *); return string(buf, end, "(null)", spec); + } switch (*fmt) { case 'F': @@ -939,6 +1014,7 @@ static char *pointer(const char *fmt, char *buf, char *end, void *ptr, return resource_string(buf, end, ptr, spec, fmt); case 'M': /* Colon separated: 00:01:02:03:04:05 */ case 'm': /* Contiguous: 000102030405 */ + /* [mM]F (FDDI, bit reversed) */ return mac_address_string(buf, end, ptr, spec, fmt); case 'I': /* Formatted IP supported * 4: 1.2.3.4 @@ -958,10 +1034,33 @@ static char *pointer(const char *fmt, char *buf, char *end, void *ptr, break; case 'U': return uuid_string(buf, end, ptr, spec, fmt); + case 'V': + return buf + vsnprintf(buf, end - buf, + ((struct va_format *)ptr)->fmt, + *(((struct va_format *)ptr)->va)); + case 'K': + /* + * %pK cannot be used in IRQ context because its test + * for CAP_SYSLOG would be meaningless. + */ + if (in_irq() || in_serving_softirq() || in_nmi()) { + if (spec.field_width == -1) + spec.field_width = 2 * sizeof(void *); + return string(buf, end, "pK-error", spec); + } else if ((kptr_restrict == 0) || + (kptr_restrict == 1 && + has_capability_noaudit(current, CAP_SYSLOG))) + break; + + if (spec.field_width == -1) { + spec.field_width = 2 * sizeof(void *); + spec.flags |= ZEROPAD; + } + return number(buf, end, 0, spec); } spec.flags |= SMALL; if (spec.field_width == -1) { - spec.field_width = 2*sizeof(void *); + spec.field_width = 2 * sizeof(void *); spec.flags |= ZEROPAD; } spec.base = 16; @@ -989,7 +1088,8 @@ static char *pointer(const char *fmt, char *buf, char *end, void *ptr, * @precision: precision of a number * @qualifier: qualifier of a number (long, size_t, ...) */ -static int format_decode(const char *fmt, struct printf_spec *spec) +static noinline_for_stack +int format_decode(const char *fmt, struct printf_spec *spec) { const char *start = fmt; @@ -1297,7 +1397,7 @@ int vsnprintf(char *buf, size_t size, const char *fmt, va_list args) break; case FORMAT_TYPE_NRCHARS: { - int qualifier = spec.qualifier; + u8 qualifier = spec.qualifier; if (qualifier == 'l') { long *ip = va_arg(args, long *); @@ -1373,7 +1473,7 @@ EXPORT_SYMBOL(vsnprintf); * @args: Arguments for the format string * * The return value is the number of characters which have been written into - * the @buf not including the trailing '\0'. If @size is <= 0 the function + * the @buf not including the trailing '\0'. If @size is == 0 the function * returns 0. * * Call this function if you are already dealing with a va_list. @@ -1387,7 +1487,11 @@ int vscnprintf(char *buf, size_t size, const char *fmt, va_list args) i = vsnprintf(buf, size, fmt, args); - return (i >= size) ? 
(size - 1) : i; + if (likely(i < size)) + return i; + if (size != 0) + return size - 1; + return 0; } EXPORT_SYMBOL(vscnprintf); @@ -1426,7 +1530,7 @@ EXPORT_SYMBOL(snprintf); * @...: Arguments for the format string * * The return value is the number of characters written into @buf not including - * the trailing '\0'. If @size is <= 0 the function returns 0. + * the trailing '\0'. If @size is == 0 the function returns 0. */ int scnprintf(char *buf, size_t size, const char *fmt, ...) @@ -1435,10 +1539,10 @@ int scnprintf(char *buf, size_t size, const char *fmt, ...) int i; va_start(args, fmt); - i = vsnprintf(buf, size, fmt, args); + i = vscnprintf(buf, size, fmt, args); va_end(args); - return (i >= size) ? (size - 1) : i; + return i; } EXPORT_SYMBOL(scnprintf); @@ -1583,7 +1687,7 @@ do { \ case FORMAT_TYPE_NRCHARS: { /* skip %n 's argument */ - int qualifier = spec.qualifier; + u8 qualifier = spec.qualifier; void *skip_arg; if (qualifier == 'l') skip_arg = va_arg(args, long *); @@ -1849,7 +1953,9 @@ int vsscanf(const char *buf, const char *fmt, va_list args) char *next; char digit; int num = 0; - int qualifier, base, field_width; + u8 qualifier; + u8 base; + s16 field_width; bool is_sign; while (*fmt && *str) { @@ -1927,7 +2033,7 @@ int vsscanf(const char *buf, const char *fmt, va_list args) { char *s = (char *)va_arg(args, char *); if (field_width == -1) - field_width = INT_MAX; + field_width = SHRT_MAX; /* first, skip leading white space in buffer */ str = skip_spaces(str); diff --git a/lib/xz/Kconfig b/lib/xz/Kconfig new file mode 100644 index 000000000000..e3b6e18fdac5 --- /dev/null +++ b/lib/xz/Kconfig @@ -0,0 +1,59 @@ +config XZ_DEC + tristate "XZ decompression support" + select CRC32 + help + LZMA2 compression algorithm and BCJ filters are supported using + the .xz file format as the container. For integrity checking, + CRC32 is supported. See Documentation/xz.txt for more information. + +config XZ_DEC_X86 + bool "x86 BCJ filter decoder" if EMBEDDED + default y + depends on XZ_DEC + select XZ_DEC_BCJ + +config XZ_DEC_POWERPC + bool "PowerPC BCJ filter decoder" if EMBEDDED + default y + depends on XZ_DEC + select XZ_DEC_BCJ + +config XZ_DEC_IA64 + bool "IA-64 BCJ filter decoder" if EMBEDDED + default y + depends on XZ_DEC + select XZ_DEC_BCJ + +config XZ_DEC_ARM + bool "ARM BCJ filter decoder" if EMBEDDED + default y + depends on XZ_DEC + select XZ_DEC_BCJ + +config XZ_DEC_ARMTHUMB + bool "ARM-Thumb BCJ filter decoder" if EMBEDDED + default y + depends on XZ_DEC + select XZ_DEC_BCJ + +config XZ_DEC_SPARC + bool "SPARC BCJ filter decoder" if EMBEDDED + default y + depends on XZ_DEC + select XZ_DEC_BCJ + +config XZ_DEC_BCJ + bool + default n + +config XZ_DEC_TEST + tristate "XZ decompressor tester" + default n + depends on XZ_DEC + help + This allows passing .xz files to the in-kernel XZ decoder via + a character special file. It calculates CRC32 of the decompressed + data and writes diagnostics to the system log. + + Unless you are developing the XZ decoder, you don't need this + and should say N. 
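[Editorial note on the new lib/xz/Kconfig above] XZ_DEC and the per-architecture BCJ options only build the decoder; callers drive it through the xz_dec_* API from include/linux/xz.h. The sketch below assumes the usual xz_dec_init()/xz_dec_run()/xz_dec_end() entry points and struct xz_buf layout, since the header itself is not part of this hunk, so treat the exact names as an assumption.

    #include <linux/xz.h>
    #include <linux/errno.h>

    /* hedged sketch: decompress one complete in-memory .xz stream */
    static int my_unxz(const u8 *in, size_t in_size, u8 *out, size_t out_size)
    {
            struct xz_dec *s;
            struct xz_buf b = {
                    .in  = in,  .in_pos  = 0, .in_size  = in_size,
                    .out = out, .out_pos = 0, .out_size = out_size,
            };
            enum xz_ret ret;

            s = xz_dec_init(XZ_DYNALLOC, 1 << 26);  /* dictionary up to 64 MiB */
            if (!s)
                    return -ENOMEM;

            ret = xz_dec_run(s, &b);
            xz_dec_end(s);

            return ret == XZ_STREAM_END ? 0 : -EINVAL;
    }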
diff --git a/lib/xz/Makefile b/lib/xz/Makefile new file mode 100644 index 000000000000..a7fa7693f0f3 --- /dev/null +++ b/lib/xz/Makefile @@ -0,0 +1,5 @@ +obj-$(CONFIG_XZ_DEC) += xz_dec.o +xz_dec-y := xz_dec_syms.o xz_dec_stream.o xz_dec_lzma2.o +xz_dec-$(CONFIG_XZ_DEC_BCJ) += xz_dec_bcj.o + +obj-$(CONFIG_XZ_DEC_TEST) += xz_dec_test.o diff --git a/lib/xz/xz_crc32.c b/lib/xz/xz_crc32.c new file mode 100644 index 000000000000..34532d14fd4c --- /dev/null +++ b/lib/xz/xz_crc32.c @@ -0,0 +1,59 @@ +/* + * CRC32 using the polynomial from IEEE-802.3 + * + * Authors: Lasse Collin <lasse.collin@tukaani.org> + * Igor Pavlov <http://7-zip.org/> + * + * This file has been put into the public domain. + * You can do whatever you want with this file. + */ + +/* + * This is not the fastest implementation, but it is pretty compact. + * The fastest versions of xz_crc32() on modern CPUs without hardware + * accelerated CRC instruction are 3-5 times as fast as this version, + * but they are bigger and use more memory for the lookup table. + */ + +#include "xz_private.h" + +/* + * STATIC_RW_DATA is used in the pre-boot environment on some architectures. + * See <linux/decompress/mm.h> for details. + */ +#ifndef STATIC_RW_DATA +# define STATIC_RW_DATA static +#endif + +STATIC_RW_DATA uint32_t xz_crc32_table[256]; + +XZ_EXTERN void xz_crc32_init(void) +{ + const uint32_t poly = 0xEDB88320; + + uint32_t i; + uint32_t j; + uint32_t r; + + for (i = 0; i < 256; ++i) { + r = i; + for (j = 0; j < 8; ++j) + r = (r >> 1) ^ (poly & ~((r & 1) - 1)); + + xz_crc32_table[i] = r; + } + + return; +} + +XZ_EXTERN uint32_t xz_crc32(const uint8_t *buf, size_t size, uint32_t crc) +{ + crc = ~crc; + + while (size != 0) { + crc = xz_crc32_table[*buf++ ^ (crc & 0xFF)] ^ (crc >> 8); + --size; + } + + return ~crc; +} diff --git a/lib/xz/xz_dec_bcj.c b/lib/xz/xz_dec_bcj.c new file mode 100644 index 000000000000..e51e2558ca9d --- /dev/null +++ b/lib/xz/xz_dec_bcj.c @@ -0,0 +1,561 @@ +/* + * Branch/Call/Jump (BCJ) filter decoders + * + * Authors: Lasse Collin <lasse.collin@tukaani.org> + * Igor Pavlov <http://7-zip.org/> + * + * This file has been put into the public domain. + * You can do whatever you want with this file. + */ + +#include "xz_private.h" + +/* + * The rest of the file is inside this ifdef. It makes things a little more + * convenient when building without support for any BCJ filters. + */ +#ifdef XZ_DEC_BCJ + +struct xz_dec_bcj { + /* Type of the BCJ filter being used */ + enum { + BCJ_X86 = 4, /* x86 or x86-64 */ + BCJ_POWERPC = 5, /* Big endian only */ + BCJ_IA64 = 6, /* Big or little endian */ + BCJ_ARM = 7, /* Little endian only */ + BCJ_ARMTHUMB = 8, /* Little endian only */ + BCJ_SPARC = 9 /* Big or little endian */ + } type; + + /* + * Return value of the next filter in the chain. We need to preserve + * this information across calls, because we must not call the next + * filter anymore once it has returned XZ_STREAM_END. + */ + enum xz_ret ret; + + /* True if we are operating in single-call mode. */ + bool single_call; + + /* + * Absolute position relative to the beginning of the uncompressed + * data (in a single .xz Block). We care only about the lowest 32 + * bits so this doesn't need to be uint64_t even with big files. 
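[Editorial note on struct xz_dec_bcj above] The absolute position 'pos' matters because the BCJ filters rewrite branch targets between relative and absolute form, which makes call-heavy machine code far more repetitive and therefore more compressible. A small user-space sketch of the core x86 idea, assuming the usual 5-byte E8 rel32 call encoding; the full decoder, bcj_x86() further down, additionally tracks recently seen candidate bytes to filter false positives.

    #include <stdint.h>
    #include <stdio.h>

    /* undo the x86 BCJ transform for a single E8 call site */
    int main(void)
    {
            uint32_t pos    = 0x1000;      /* absolute position of this chunk  */
            uint32_t i      = 0x10;        /* offset of the 0xE8 opcode        */
            uint32_t stored = 0x00002345;  /* absolute target kept in the
                                              compressed stream                */

            /* encoder stored: absolute = relative + (pos + i + 5);
             * the decoder recovers the original relative displacement */
            uint32_t relative = stored - (pos + i + 5);

            printf("rel32 = 0x%08x\n", relative);   /* prints 0x00001330 */
            return 0;
    }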
+ */ + uint32_t pos; + + /* x86 filter state */ + uint32_t x86_prev_mask; + + /* Temporary space to hold the variables from struct xz_buf */ + uint8_t *out; + size_t out_pos; + size_t out_size; + + struct { + /* Amount of already filtered data in the beginning of buf */ + size_t filtered; + + /* Total amount of data currently stored in buf */ + size_t size; + + /* + * Buffer to hold a mix of filtered and unfiltered data. This + * needs to be big enough to hold Alignment + 2 * Look-ahead: + * + * Type Alignment Look-ahead + * x86 1 4 + * PowerPC 4 0 + * IA-64 16 0 + * ARM 4 0 + * ARM-Thumb 2 2 + * SPARC 4 0 + */ + uint8_t buf[16]; + } temp; +}; + +#ifdef XZ_DEC_X86 +/* + * This is used to test the most significant byte of a memory address + * in an x86 instruction. + */ +static inline int bcj_x86_test_msbyte(uint8_t b) +{ + return b == 0x00 || b == 0xFF; +} + +static size_t bcj_x86(struct xz_dec_bcj *s, uint8_t *buf, size_t size) +{ + static const bool mask_to_allowed_status[8] + = { true, true, true, false, true, false, false, false }; + + static const uint8_t mask_to_bit_num[8] = { 0, 1, 2, 2, 3, 3, 3, 3 }; + + size_t i; + size_t prev_pos = (size_t)-1; + uint32_t prev_mask = s->x86_prev_mask; + uint32_t src; + uint32_t dest; + uint32_t j; + uint8_t b; + + if (size <= 4) + return 0; + + size -= 4; + for (i = 0; i < size; ++i) { + if ((buf[i] & 0xFE) != 0xE8) + continue; + + prev_pos = i - prev_pos; + if (prev_pos > 3) { + prev_mask = 0; + } else { + prev_mask = (prev_mask << (prev_pos - 1)) & 7; + if (prev_mask != 0) { + b = buf[i + 4 - mask_to_bit_num[prev_mask]]; + if (!mask_to_allowed_status[prev_mask] + || bcj_x86_test_msbyte(b)) { + prev_pos = i; + prev_mask = (prev_mask << 1) | 1; + continue; + } + } + } + + prev_pos = i; + + if (bcj_x86_test_msbyte(buf[i + 4])) { + src = get_unaligned_le32(buf + i + 1); + while (true) { + dest = src - (s->pos + (uint32_t)i + 5); + if (prev_mask == 0) + break; + + j = mask_to_bit_num[prev_mask] * 8; + b = (uint8_t)(dest >> (24 - j)); + if (!bcj_x86_test_msbyte(b)) + break; + + src = dest ^ (((uint32_t)1 << (32 - j)) - 1); + } + + dest &= 0x01FFFFFF; + dest |= (uint32_t)0 - (dest & 0x01000000); + put_unaligned_le32(dest, buf + i + 1); + i += 4; + } else { + prev_mask = (prev_mask << 1) | 1; + } + } + + prev_pos = i - prev_pos; + s->x86_prev_mask = prev_pos > 3 ? 0 : prev_mask << (prev_pos - 1); + return i; +} +#endif + +#ifdef XZ_DEC_POWERPC +static size_t bcj_powerpc(struct xz_dec_bcj *s, uint8_t *buf, size_t size) +{ + size_t i; + uint32_t instr; + + for (i = 0; i + 4 <= size; i += 4) { + instr = get_unaligned_be32(buf + i); + if ((instr & 0xFC000003) == 0x48000001) { + instr &= 0x03FFFFFC; + instr -= s->pos + (uint32_t)i; + instr &= 0x03FFFFFC; + instr |= 0x48000001; + put_unaligned_be32(instr, buf + i); + } + } + + return i; +} +#endif + +#ifdef XZ_DEC_IA64 +static size_t bcj_ia64(struct xz_dec_bcj *s, uint8_t *buf, size_t size) +{ + static const uint8_t branch_table[32] = { + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 4, 4, 6, 6, 0, 0, 7, 7, + 4, 4, 0, 0, 4, 4, 0, 0 + }; + + /* + * The local variables take a little bit stack space, but it's less + * than what LZMA2 decoder takes, so it doesn't make sense to reduce + * stack usage here without doing that for the LZMA2 decoder too. 
+ */ + + /* Loop counters */ + size_t i; + size_t j; + + /* Instruction slot (0, 1, or 2) in the 128-bit instruction word */ + uint32_t slot; + + /* Bitwise offset of the instruction indicated by slot */ + uint32_t bit_pos; + + /* bit_pos split into byte and bit parts */ + uint32_t byte_pos; + uint32_t bit_res; + + /* Address part of an instruction */ + uint32_t addr; + + /* Mask used to detect which instructions to convert */ + uint32_t mask; + + /* 41-bit instruction stored somewhere in the lowest 48 bits */ + uint64_t instr; + + /* Instruction normalized with bit_res for easier manipulation */ + uint64_t norm; + + for (i = 0; i + 16 <= size; i += 16) { + mask = branch_table[buf[i] & 0x1F]; + for (slot = 0, bit_pos = 5; slot < 3; ++slot, bit_pos += 41) { + if (((mask >> slot) & 1) == 0) + continue; + + byte_pos = bit_pos >> 3; + bit_res = bit_pos & 7; + instr = 0; + for (j = 0; j < 6; ++j) + instr |= (uint64_t)(buf[i + j + byte_pos]) + << (8 * j); + + norm = instr >> bit_res; + + if (((norm >> 37) & 0x0F) == 0x05 + && ((norm >> 9) & 0x07) == 0) { + addr = (norm >> 13) & 0x0FFFFF; + addr |= ((uint32_t)(norm >> 36) & 1) << 20; + addr <<= 4; + addr -= s->pos + (uint32_t)i; + addr >>= 4; + + norm &= ~((uint64_t)0x8FFFFF << 13); + norm |= (uint64_t)(addr & 0x0FFFFF) << 13; + norm |= (uint64_t)(addr & 0x100000) + << (36 - 20); + + instr &= (1 << bit_res) - 1; + instr |= norm << bit_res; + + for (j = 0; j < 6; j++) + buf[i + j + byte_pos] + = (uint8_t)(instr >> (8 * j)); + } + } + } + + return i; +} +#endif + +#ifdef XZ_DEC_ARM +static size_t bcj_arm(struct xz_dec_bcj *s, uint8_t *buf, size_t size) +{ + size_t i; + uint32_t addr; + + for (i = 0; i + 4 <= size; i += 4) { + if (buf[i + 3] == 0xEB) { + addr = (uint32_t)buf[i] | ((uint32_t)buf[i + 1] << 8) + | ((uint32_t)buf[i + 2] << 16); + addr <<= 2; + addr -= s->pos + (uint32_t)i + 8; + addr >>= 2; + buf[i] = (uint8_t)addr; + buf[i + 1] = (uint8_t)(addr >> 8); + buf[i + 2] = (uint8_t)(addr >> 16); + } + } + + return i; +} +#endif + +#ifdef XZ_DEC_ARMTHUMB +static size_t bcj_armthumb(struct xz_dec_bcj *s, uint8_t *buf, size_t size) +{ + size_t i; + uint32_t addr; + + for (i = 0; i + 4 <= size; i += 2) { + if ((buf[i + 1] & 0xF8) == 0xF0 + && (buf[i + 3] & 0xF8) == 0xF8) { + addr = (((uint32_t)buf[i + 1] & 0x07) << 19) + | ((uint32_t)buf[i] << 11) + | (((uint32_t)buf[i + 3] & 0x07) << 8) + | (uint32_t)buf[i + 2]; + addr <<= 1; + addr -= s->pos + (uint32_t)i + 4; + addr >>= 1; + buf[i + 1] = (uint8_t)(0xF0 | ((addr >> 19) & 0x07)); + buf[i] = (uint8_t)(addr >> 11); + buf[i + 3] = (uint8_t)(0xF8 | ((addr >> 8) & 0x07)); + buf[i + 2] = (uint8_t)addr; + i += 2; + } + } + + return i; +} +#endif + +#ifdef XZ_DEC_SPARC +static size_t bcj_sparc(struct xz_dec_bcj *s, uint8_t *buf, size_t size) +{ + size_t i; + uint32_t instr; + + for (i = 0; i + 4 <= size; i += 4) { + instr = get_unaligned_be32(buf + i); + if ((instr >> 22) == 0x100 || (instr >> 22) == 0x1FF) { + instr <<= 2; + instr -= s->pos + (uint32_t)i; + instr >>= 2; + instr = ((uint32_t)0x40000000 - (instr & 0x400000)) + | 0x40000000 | (instr & 0x3FFFFF); + put_unaligned_be32(instr, buf + i); + } + } + + return i; +} +#endif + +/* + * Apply the selected BCJ filter. Update *pos and s->pos to match the amount + * of data that got filtered. + * + * NOTE: This is implemented as a switch statement to avoid using function + * pointers, which could be problematic in the kernel boot code, which must + * avoid pointers to static data (at least on x86). 
+ */ +static void bcj_apply(struct xz_dec_bcj *s, + uint8_t *buf, size_t *pos, size_t size) +{ + size_t filtered; + + buf += *pos; + size -= *pos; + + switch (s->type) { +#ifdef XZ_DEC_X86 + case BCJ_X86: + filtered = bcj_x86(s, buf, size); + break; +#endif +#ifdef XZ_DEC_POWERPC + case BCJ_POWERPC: + filtered = bcj_powerpc(s, buf, size); + break; +#endif +#ifdef XZ_DEC_IA64 + case BCJ_IA64: + filtered = bcj_ia64(s, buf, size); + break; +#endif +#ifdef XZ_DEC_ARM + case BCJ_ARM: + filtered = bcj_arm(s, buf, size); + break; +#endif +#ifdef XZ_DEC_ARMTHUMB + case BCJ_ARMTHUMB: + filtered = bcj_armthumb(s, buf, size); + break; +#endif +#ifdef XZ_DEC_SPARC + case BCJ_SPARC: + filtered = bcj_sparc(s, buf, size); + break; +#endif + default: + /* Never reached but silence compiler warnings. */ + filtered = 0; + break; + } + + *pos += filtered; + s->pos += filtered; +} + +/* + * Flush pending filtered data from temp to the output buffer. + * Move the remaining mixture of possibly filtered and unfiltered + * data to the beginning of temp. + */ +static void bcj_flush(struct xz_dec_bcj *s, struct xz_buf *b) +{ + size_t copy_size; + + copy_size = min_t(size_t, s->temp.filtered, b->out_size - b->out_pos); + memcpy(b->out + b->out_pos, s->temp.buf, copy_size); + b->out_pos += copy_size; + + s->temp.filtered -= copy_size; + s->temp.size -= copy_size; + memmove(s->temp.buf, s->temp.buf + copy_size, s->temp.size); +} + +/* + * The BCJ filter functions are primitive in sense that they process the + * data in chunks of 1-16 bytes. To hide this issue, this function does + * some buffering. + */ +XZ_EXTERN enum xz_ret xz_dec_bcj_run(struct xz_dec_bcj *s, + struct xz_dec_lzma2 *lzma2, + struct xz_buf *b) +{ + size_t out_start; + + /* + * Flush pending already filtered data to the output buffer. Return + * immediatelly if we couldn't flush everything, or if the next + * filter in the chain had already returned XZ_STREAM_END. + */ + if (s->temp.filtered > 0) { + bcj_flush(s, b); + if (s->temp.filtered > 0) + return XZ_OK; + + if (s->ret == XZ_STREAM_END) + return XZ_STREAM_END; + } + + /* + * If we have more output space than what is currently pending in + * temp, copy the unfiltered data from temp to the output buffer + * and try to fill the output buffer by decoding more data from the + * next filter in the chain. Apply the BCJ filter on the new data + * in the output buffer. If everything cannot be filtered, copy it + * to temp and rewind the output buffer position accordingly. + */ + if (s->temp.size < b->out_size - b->out_pos) { + out_start = b->out_pos; + memcpy(b->out + b->out_pos, s->temp.buf, s->temp.size); + b->out_pos += s->temp.size; + + s->ret = xz_dec_lzma2_run(lzma2, b); + if (s->ret != XZ_STREAM_END + && (s->ret != XZ_OK || s->single_call)) + return s->ret; + + bcj_apply(s, b->out, &out_start, b->out_pos); + + /* + * As an exception, if the next filter returned XZ_STREAM_END, + * we can do that too, since the last few bytes that remain + * unfiltered are meant to remain unfiltered. + */ + if (s->ret == XZ_STREAM_END) + return XZ_STREAM_END; + + s->temp.size = b->out_pos - out_start; + b->out_pos -= s->temp.size; + memcpy(s->temp.buf, b->out + b->out_pos, s->temp.size); + } + + /* + * If we have unfiltered data in temp, try to fill by decoding more + * data from the next filter. Apply the BCJ filter on temp. Then we + * hopefully can fill the actual output buffer by copying filtered + * data from temp. 
A mix of filtered and unfiltered data may be left + * in temp; it will be taken care on the next call to this function. + */ + if (s->temp.size > 0) { + /* Make b->out{,_pos,_size} temporarily point to s->temp. */ + s->out = b->out; + s->out_pos = b->out_pos; + s->out_size = b->out_size; + b->out = s->temp.buf; + b->out_pos = s->temp.size; + b->out_size = sizeof(s->temp.buf); + + s->ret = xz_dec_lzma2_run(lzma2, b); + + s->temp.size = b->out_pos; + b->out = s->out; + b->out_pos = s->out_pos; + b->out_size = s->out_size; + + if (s->ret != XZ_OK && s->ret != XZ_STREAM_END) + return s->ret; + + bcj_apply(s, s->temp.buf, &s->temp.filtered, s->temp.size); + + /* + * If the next filter returned XZ_STREAM_END, we mark that + * everything is filtered, since the last unfiltered bytes + * of the stream are meant to be left as is. + */ + if (s->ret == XZ_STREAM_END) + s->temp.filtered = s->temp.size; + + bcj_flush(s, b); + if (s->temp.filtered > 0) + return XZ_OK; + } + + return s->ret; +} + +XZ_EXTERN struct xz_dec_bcj *xz_dec_bcj_create(bool single_call) +{ + struct xz_dec_bcj *s = kmalloc(sizeof(*s), GFP_KERNEL); + if (s != NULL) + s->single_call = single_call; + + return s; +} + +XZ_EXTERN enum xz_ret xz_dec_bcj_reset(struct xz_dec_bcj *s, uint8_t id) +{ + switch (id) { +#ifdef XZ_DEC_X86 + case BCJ_X86: +#endif +#ifdef XZ_DEC_POWERPC + case BCJ_POWERPC: +#endif +#ifdef XZ_DEC_IA64 + case BCJ_IA64: +#endif +#ifdef XZ_DEC_ARM + case BCJ_ARM: +#endif +#ifdef XZ_DEC_ARMTHUMB + case BCJ_ARMTHUMB: +#endif +#ifdef XZ_DEC_SPARC + case BCJ_SPARC: +#endif + break; + + default: + /* Unsupported Filter ID */ + return XZ_OPTIONS_ERROR; + } + + s->type = id; + s->ret = XZ_OK; + s->pos = 0; + s->x86_prev_mask = 0; + s->temp.filtered = 0; + s->temp.size = 0; + + return XZ_OK; +} + +#endif diff --git a/lib/xz/xz_dec_lzma2.c b/lib/xz/xz_dec_lzma2.c new file mode 100644 index 000000000000..ea5fa4fe9d67 --- /dev/null +++ b/lib/xz/xz_dec_lzma2.c @@ -0,0 +1,1171 @@ +/* + * LZMA2 decoder + * + * Authors: Lasse Collin <lasse.collin@tukaani.org> + * Igor Pavlov <http://7-zip.org/> + * + * This file has been put into the public domain. + * You can do whatever you want with this file. + */ + +#include "xz_private.h" +#include "xz_lzma2.h" + +/* + * Range decoder initialization eats the first five bytes of each LZMA chunk. + */ +#define RC_INIT_BYTES 5 + +/* + * Minimum number of usable input buffer to safely decode one LZMA symbol. + * The worst case is that we decode 22 bits using probabilities and 26 + * direct bits. This may decode at maximum of 20 bytes of input. However, + * lzma_main() does an extra normalization before returning, thus we + * need to put 21 here. + */ +#define LZMA_IN_REQUIRED 21 + +/* + * Dictionary (history buffer) + * + * These are always true: + * start <= pos <= full <= end + * pos <= limit <= end + * + * In multi-call mode, also these are true: + * end == size + * size <= size_max + * allocated <= size + * + * Most of these variables are size_t to support single-call mode, + * in which the dictionary variables address the actual output + * buffer directly. + */ +struct dictionary { + /* Beginning of the history buffer */ + uint8_t *buf; + + /* Old position in buf (before decoding more data) */ + size_t start; + + /* Position in buf */ + size_t pos; + + /* + * How full dictionary is. This is used to detect corrupt input that + * would read beyond the beginning of the uncompressed stream. + */ + size_t full; + + /* Write limit; we don't write to buf[limit] or later bytes. 
*/ + size_t limit; + + /* + * End of the dictionary buffer. In multi-call mode, this is + * the same as the dictionary size. In single-call mode, this + * indicates the size of the output buffer. + */ + size_t end; + + /* + * Size of the dictionary as specified in Block Header. This is used + * together with "full" to detect corrupt input that would make us + * read beyond the beginning of the uncompressed stream. + */ + uint32_t size; + + /* + * Maximum allowed dictionary size in multi-call mode. + * This is ignored in single-call mode. + */ + uint32_t size_max; + + /* + * Amount of memory currently allocated for the dictionary. + * This is used only with XZ_DYNALLOC. (With XZ_PREALLOC, + * size_max is always the same as the allocated size.) + */ + uint32_t allocated; + + /* Operation mode */ + enum xz_mode mode; +}; + +/* Range decoder */ +struct rc_dec { + uint32_t range; + uint32_t code; + + /* + * Number of initializing bytes remaining to be read + * by rc_read_init(). + */ + uint32_t init_bytes_left; + + /* + * Buffer from which we read our input. It can be either + * temp.buf or the caller-provided input buffer. + */ + const uint8_t *in; + size_t in_pos; + size_t in_limit; +}; + +/* Probabilities for a length decoder. */ +struct lzma_len_dec { + /* Probability of match length being at least 10 */ + uint16_t choice; + + /* Probability of match length being at least 18 */ + uint16_t choice2; + + /* Probabilities for match lengths 2-9 */ + uint16_t low[POS_STATES_MAX][LEN_LOW_SYMBOLS]; + + /* Probabilities for match lengths 10-17 */ + uint16_t mid[POS_STATES_MAX][LEN_MID_SYMBOLS]; + + /* Probabilities for match lengths 18-273 */ + uint16_t high[LEN_HIGH_SYMBOLS]; +}; + +struct lzma_dec { + /* Distances of latest four matches */ + uint32_t rep0; + uint32_t rep1; + uint32_t rep2; + uint32_t rep3; + + /* Types of the most recently seen LZMA symbols */ + enum lzma_state state; + + /* + * Length of a match. This is updated so that dict_repeat can + * be called again to finish repeating the whole match. + */ + uint32_t len; + + /* + * LZMA properties or related bit masks (number of literal + * context bits, a mask dervied from the number of literal + * position bits, and a mask dervied from the number + * position bits) + */ + uint32_t lc; + uint32_t literal_pos_mask; /* (1 << lp) - 1 */ + uint32_t pos_mask; /* (1 << pb) - 1 */ + + /* If 1, it's a match. Otherwise it's a single 8-bit literal. */ + uint16_t is_match[STATES][POS_STATES_MAX]; + + /* If 1, it's a repeated match. The distance is one of rep0 .. rep3. */ + uint16_t is_rep[STATES]; + + /* + * If 0, distance of a repeated match is rep0. + * Otherwise check is_rep1. + */ + uint16_t is_rep0[STATES]; + + /* + * If 0, distance of a repeated match is rep1. + * Otherwise check is_rep2. + */ + uint16_t is_rep1[STATES]; + + /* If 0, distance of a repeated match is rep2. Otherwise it is rep3. */ + uint16_t is_rep2[STATES]; + + /* + * If 1, the repeated match has length of one byte. Otherwise + * the length is decoded from rep_len_decoder. + */ + uint16_t is_rep0_long[STATES][POS_STATES_MAX]; + + /* + * Probability tree for the highest two bits of the match + * distance. There is a separate probability tree for match + * lengths of 2 (i.e. MATCH_LEN_MIN), 3, 4, and [5, 273]. + */ + uint16_t dist_slot[DIST_STATES][DIST_SLOTS]; + + /* + * Probility trees for additional bits for match distance + * when the distance is in the range [4, 127]. 
+ */ + uint16_t dist_special[FULL_DISTANCES - DIST_MODEL_END]; + + /* + * Probability tree for the lowest four bits of a match + * distance that is equal to or greater than 128. + */ + uint16_t dist_align[ALIGN_SIZE]; + + /* Length of a normal match */ + struct lzma_len_dec match_len_dec; + + /* Length of a repeated match */ + struct lzma_len_dec rep_len_dec; + + /* Probabilities of literals */ + uint16_t literal[LITERAL_CODERS_MAX][LITERAL_CODER_SIZE]; +}; + +struct lzma2_dec { + /* Position in xz_dec_lzma2_run(). */ + enum lzma2_seq { + SEQ_CONTROL, + SEQ_UNCOMPRESSED_1, + SEQ_UNCOMPRESSED_2, + SEQ_COMPRESSED_0, + SEQ_COMPRESSED_1, + SEQ_PROPERTIES, + SEQ_LZMA_PREPARE, + SEQ_LZMA_RUN, + SEQ_COPY + } sequence; + + /* Next position after decoding the compressed size of the chunk. */ + enum lzma2_seq next_sequence; + + /* Uncompressed size of LZMA chunk (2 MiB at maximum) */ + uint32_t uncompressed; + + /* + * Compressed size of LZMA chunk or compressed/uncompressed + * size of uncompressed chunk (64 KiB at maximum) + */ + uint32_t compressed; + + /* + * True if dictionary reset is needed. This is false before + * the first chunk (LZMA or uncompressed). + */ + bool need_dict_reset; + + /* + * True if new LZMA properties are needed. This is false + * before the first LZMA chunk. + */ + bool need_props; +}; + +struct xz_dec_lzma2 { + /* + * The order below is important on x86 to reduce code size and + * it shouldn't hurt on other platforms. Everything up to and + * including lzma.pos_mask are in the first 128 bytes on x86-32, + * which allows using smaller instructions to access those + * variables. On x86-64, fewer variables fit into the first 128 + * bytes, but this is still the best order without sacrificing + * the readability by splitting the structures. + */ + struct rc_dec rc; + struct dictionary dict; + struct lzma2_dec lzma2; + struct lzma_dec lzma; + + /* + * Temporary buffer which holds small number of input bytes between + * decoder calls. See lzma2_lzma() for details. + */ + struct { + uint32_t size; + uint8_t buf[3 * LZMA_IN_REQUIRED]; + } temp; +}; + +/************** + * Dictionary * + **************/ + +/* + * Reset the dictionary state. When in single-call mode, set up the beginning + * of the dictionary to point to the actual output buffer. + */ +static void dict_reset(struct dictionary *dict, struct xz_buf *b) +{ + if (DEC_IS_SINGLE(dict->mode)) { + dict->buf = b->out + b->out_pos; + dict->end = b->out_size - b->out_pos; + } + + dict->start = 0; + dict->pos = 0; + dict->limit = 0; + dict->full = 0; +} + +/* Set dictionary write limit */ +static void dict_limit(struct dictionary *dict, size_t out_max) +{ + if (dict->end - dict->pos <= out_max) + dict->limit = dict->end; + else + dict->limit = dict->pos + out_max; +} + +/* Return true if at least one byte can be written into the dictionary. */ +static inline bool dict_has_space(const struct dictionary *dict) +{ + return dict->pos < dict->limit; +} + +/* + * Get a byte from the dictionary at the given distance. The distance is + * assumed to valid, or as a special case, zero when the dictionary is + * still empty. This special case is needed for single-call decoding to + * avoid writing a '\0' to the end of the destination buffer. + */ +static inline uint32_t dict_get(const struct dictionary *dict, uint32_t dist) +{ + size_t offset = dict->pos - dist - 1; + + if (dist >= dict->pos) + offset += dict->end; + + return dict->full > 0 ? dict->buf[offset] : 0; +} + +/* + * Put one byte into the dictionary. 
It is assumed that there is space for it. + */ +static inline void dict_put(struct dictionary *dict, uint8_t byte) +{ + dict->buf[dict->pos++] = byte; + + if (dict->full < dict->pos) + dict->full = dict->pos; +} + +/* + * Repeat given number of bytes from the given distance. If the distance is + * invalid, false is returned. On success, true is returned and *len is + * updated to indicate how many bytes were left to be repeated. + */ +static bool dict_repeat(struct dictionary *dict, uint32_t *len, uint32_t dist) +{ + size_t back; + uint32_t left; + + if (dist >= dict->full || dist >= dict->size) + return false; + + left = min_t(size_t, dict->limit - dict->pos, *len); + *len -= left; + + back = dict->pos - dist - 1; + if (dist >= dict->pos) + back += dict->end; + + do { + dict->buf[dict->pos++] = dict->buf[back++]; + if (back == dict->end) + back = 0; + } while (--left > 0); + + if (dict->full < dict->pos) + dict->full = dict->pos; + + return true; +} + +/* Copy uncompressed data as is from input to dictionary and output buffers. */ +static void dict_uncompressed(struct dictionary *dict, struct xz_buf *b, + uint32_t *left) +{ + size_t copy_size; + + while (*left > 0 && b->in_pos < b->in_size + && b->out_pos < b->out_size) { + copy_size = min(b->in_size - b->in_pos, + b->out_size - b->out_pos); + if (copy_size > dict->end - dict->pos) + copy_size = dict->end - dict->pos; + if (copy_size > *left) + copy_size = *left; + + *left -= copy_size; + + memcpy(dict->buf + dict->pos, b->in + b->in_pos, copy_size); + dict->pos += copy_size; + + if (dict->full < dict->pos) + dict->full = dict->pos; + + if (DEC_IS_MULTI(dict->mode)) { + if (dict->pos == dict->end) + dict->pos = 0; + + memcpy(b->out + b->out_pos, b->in + b->in_pos, + copy_size); + } + + dict->start = dict->pos; + + b->out_pos += copy_size; + b->in_pos += copy_size; + } +} + +/* + * Flush pending data from dictionary to b->out. It is assumed that there is + * enough space in b->out. This is guaranteed because caller uses dict_limit() + * before decoding data into the dictionary. + */ +static uint32_t dict_flush(struct dictionary *dict, struct xz_buf *b) +{ + size_t copy_size = dict->pos - dict->start; + + if (DEC_IS_MULTI(dict->mode)) { + if (dict->pos == dict->end) + dict->pos = 0; + + memcpy(b->out + b->out_pos, dict->buf + dict->start, + copy_size); + } + + dict->start = dict->pos; + b->out_pos += copy_size; + return copy_size; +} + +/***************** + * Range decoder * + *****************/ + +/* Reset the range decoder. */ +static void rc_reset(struct rc_dec *rc) +{ + rc->range = (uint32_t)-1; + rc->code = 0; + rc->init_bytes_left = RC_INIT_BYTES; +} + +/* + * Read the first five initial bytes into rc->code if they haven't been + * read already. (Yes, the first byte gets completely ignored.) + */ +static bool rc_read_init(struct rc_dec *rc, struct xz_buf *b) +{ + while (rc->init_bytes_left > 0) { + if (b->in_pos == b->in_size) + return false; + + rc->code = (rc->code << 8) + b->in[b->in_pos++]; + --rc->init_bytes_left; + } + + return true; +} + +/* Return true if there may not be enough input for the next decoding loop. */ +static inline bool rc_limit_exceeded(const struct rc_dec *rc) +{ + return rc->in_pos > rc->in_limit; +} + +/* + * Return true if it is possible (from point of view of range decoder) that + * we have reached the end of the LZMA chunk. + */ +static inline bool rc_is_finished(const struct rc_dec *rc) +{ + return rc->code == 0; +} + +/* Read the next input byte if needed. 
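+ * That is, once the range has shrunk below RC_TOP_VALUE, shift one more
+ * byte from rc->in into rc->code and scale the range back up by
+ * RC_SHIFT_BITS bits.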
*/ +static __always_inline void rc_normalize(struct rc_dec *rc) +{ + if (rc->range < RC_TOP_VALUE) { + rc->range <<= RC_SHIFT_BITS; + rc->code = (rc->code << RC_SHIFT_BITS) + rc->in[rc->in_pos++]; + } +} + +/* + * Decode one bit. In some versions, this function has been splitted in three + * functions so that the compiler is supposed to be able to more easily avoid + * an extra branch. In this particular version of the LZMA decoder, this + * doesn't seem to be a good idea (tested with GCC 3.3.6, 3.4.6, and 4.3.3 + * on x86). Using a non-splitted version results in nicer looking code too. + * + * NOTE: This must return an int. Do not make it return a bool or the speed + * of the code generated by GCC 3.x decreases 10-15 %. (GCC 4.3 doesn't care, + * and it generates 10-20 % faster code than GCC 3.x from this file anyway.) + */ +static __always_inline int rc_bit(struct rc_dec *rc, uint16_t *prob) +{ + uint32_t bound; + int bit; + + rc_normalize(rc); + bound = (rc->range >> RC_BIT_MODEL_TOTAL_BITS) * *prob; + if (rc->code < bound) { + rc->range = bound; + *prob += (RC_BIT_MODEL_TOTAL - *prob) >> RC_MOVE_BITS; + bit = 0; + } else { + rc->range -= bound; + rc->code -= bound; + *prob -= *prob >> RC_MOVE_BITS; + bit = 1; + } + + return bit; +} + +/* Decode a bittree starting from the most significant bit. */ +static __always_inline uint32_t rc_bittree(struct rc_dec *rc, + uint16_t *probs, uint32_t limit) +{ + uint32_t symbol = 1; + + do { + if (rc_bit(rc, &probs[symbol])) + symbol = (symbol << 1) + 1; + else + symbol <<= 1; + } while (symbol < limit); + + return symbol; +} + +/* Decode a bittree starting from the least significant bit. */ +static __always_inline void rc_bittree_reverse(struct rc_dec *rc, + uint16_t *probs, + uint32_t *dest, uint32_t limit) +{ + uint32_t symbol = 1; + uint32_t i = 0; + + do { + if (rc_bit(rc, &probs[symbol])) { + symbol = (symbol << 1) + 1; + *dest += 1 << i; + } else { + symbol <<= 1; + } + } while (++i < limit); +} + +/* Decode direct bits (fixed fifty-fifty probability) */ +static inline void rc_direct(struct rc_dec *rc, uint32_t *dest, uint32_t limit) +{ + uint32_t mask; + + do { + rc_normalize(rc); + rc->range >>= 1; + rc->code -= rc->range; + mask = (uint32_t)0 - (rc->code >> 31); + rc->code += rc->range & mask; + *dest = (*dest << 1) + (mask + 1); + } while (--limit > 0); +} + +/******** + * LZMA * + ********/ + +/* Get pointer to literal coder probability array. 
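+ * The array is selected by the lc most significant bits of the previous
+ * dictionary byte combined with the literal position bits of the current
+ * position; see the calculation of low and high below.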
*/ +static uint16_t *lzma_literal_probs(struct xz_dec_lzma2 *s) +{ + uint32_t prev_byte = dict_get(&s->dict, 0); + uint32_t low = prev_byte >> (8 - s->lzma.lc); + uint32_t high = (s->dict.pos & s->lzma.literal_pos_mask) << s->lzma.lc; + return s->lzma.literal[low + high]; +} + +/* Decode a literal (one 8-bit byte) */ +static void lzma_literal(struct xz_dec_lzma2 *s) +{ + uint16_t *probs; + uint32_t symbol; + uint32_t match_byte; + uint32_t match_bit; + uint32_t offset; + uint32_t i; + + probs = lzma_literal_probs(s); + + if (lzma_state_is_literal(s->lzma.state)) { + symbol = rc_bittree(&s->rc, probs, 0x100); + } else { + symbol = 1; + match_byte = dict_get(&s->dict, s->lzma.rep0) << 1; + offset = 0x100; + + do { + match_bit = match_byte & offset; + match_byte <<= 1; + i = offset + match_bit + symbol; + + if (rc_bit(&s->rc, &probs[i])) { + symbol = (symbol << 1) + 1; + offset &= match_bit; + } else { + symbol <<= 1; + offset &= ~match_bit; + } + } while (symbol < 0x100); + } + + dict_put(&s->dict, (uint8_t)symbol); + lzma_state_literal(&s->lzma.state); +} + +/* Decode the length of the match into s->lzma.len. */ +static void lzma_len(struct xz_dec_lzma2 *s, struct lzma_len_dec *l, + uint32_t pos_state) +{ + uint16_t *probs; + uint32_t limit; + + if (!rc_bit(&s->rc, &l->choice)) { + probs = l->low[pos_state]; + limit = LEN_LOW_SYMBOLS; + s->lzma.len = MATCH_LEN_MIN; + } else { + if (!rc_bit(&s->rc, &l->choice2)) { + probs = l->mid[pos_state]; + limit = LEN_MID_SYMBOLS; + s->lzma.len = MATCH_LEN_MIN + LEN_LOW_SYMBOLS; + } else { + probs = l->high; + limit = LEN_HIGH_SYMBOLS; + s->lzma.len = MATCH_LEN_MIN + LEN_LOW_SYMBOLS + + LEN_MID_SYMBOLS; + } + } + + s->lzma.len += rc_bittree(&s->rc, probs, limit) - limit; +} + +/* Decode a match. The distance will be stored in s->lzma.rep0. */ +static void lzma_match(struct xz_dec_lzma2 *s, uint32_t pos_state) +{ + uint16_t *probs; + uint32_t dist_slot; + uint32_t limit; + + lzma_state_match(&s->lzma.state); + + s->lzma.rep3 = s->lzma.rep2; + s->lzma.rep2 = s->lzma.rep1; + s->lzma.rep1 = s->lzma.rep0; + + lzma_len(s, &s->lzma.match_len_dec, pos_state); + + probs = s->lzma.dist_slot[lzma_get_dist_state(s->lzma.len)]; + dist_slot = rc_bittree(&s->rc, probs, DIST_SLOTS) - DIST_SLOTS; + + if (dist_slot < DIST_MODEL_START) { + s->lzma.rep0 = dist_slot; + } else { + limit = (dist_slot >> 1) - 1; + s->lzma.rep0 = 2 + (dist_slot & 1); + + if (dist_slot < DIST_MODEL_END) { + s->lzma.rep0 <<= limit; + probs = s->lzma.dist_special + s->lzma.rep0 + - dist_slot - 1; + rc_bittree_reverse(&s->rc, probs, + &s->lzma.rep0, limit); + } else { + rc_direct(&s->rc, &s->lzma.rep0, limit - ALIGN_BITS); + s->lzma.rep0 <<= ALIGN_BITS; + rc_bittree_reverse(&s->rc, s->lzma.dist_align, + &s->lzma.rep0, ALIGN_BITS); + } + } +} + +/* + * Decode a repeated match. The distance is one of the four most recently + * seen matches. The distance will be stored in s->lzma.rep0. 
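+ * A short rep (both is_rep0 and is_rep0_long decode to zero) repeats a
+ * single byte from the rep0 distance; otherwise the distance is rep0 or
+ * one of rep1..rep3 moved to the front of the queue, and a new match
+ * length is decoded.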
+ */ +static void lzma_rep_match(struct xz_dec_lzma2 *s, uint32_t pos_state) +{ + uint32_t tmp; + + if (!rc_bit(&s->rc, &s->lzma.is_rep0[s->lzma.state])) { + if (!rc_bit(&s->rc, &s->lzma.is_rep0_long[ + s->lzma.state][pos_state])) { + lzma_state_short_rep(&s->lzma.state); + s->lzma.len = 1; + return; + } + } else { + if (!rc_bit(&s->rc, &s->lzma.is_rep1[s->lzma.state])) { + tmp = s->lzma.rep1; + } else { + if (!rc_bit(&s->rc, &s->lzma.is_rep2[s->lzma.state])) { + tmp = s->lzma.rep2; + } else { + tmp = s->lzma.rep3; + s->lzma.rep3 = s->lzma.rep2; + } + + s->lzma.rep2 = s->lzma.rep1; + } + + s->lzma.rep1 = s->lzma.rep0; + s->lzma.rep0 = tmp; + } + + lzma_state_long_rep(&s->lzma.state); + lzma_len(s, &s->lzma.rep_len_dec, pos_state); +} + +/* LZMA decoder core */ +static bool lzma_main(struct xz_dec_lzma2 *s) +{ + uint32_t pos_state; + + /* + * If the dictionary was reached during the previous call, try to + * finish the possibly pending repeat in the dictionary. + */ + if (dict_has_space(&s->dict) && s->lzma.len > 0) + dict_repeat(&s->dict, &s->lzma.len, s->lzma.rep0); + + /* + * Decode more LZMA symbols. One iteration may consume up to + * LZMA_IN_REQUIRED - 1 bytes. + */ + while (dict_has_space(&s->dict) && !rc_limit_exceeded(&s->rc)) { + pos_state = s->dict.pos & s->lzma.pos_mask; + + if (!rc_bit(&s->rc, &s->lzma.is_match[ + s->lzma.state][pos_state])) { + lzma_literal(s); + } else { + if (rc_bit(&s->rc, &s->lzma.is_rep[s->lzma.state])) + lzma_rep_match(s, pos_state); + else + lzma_match(s, pos_state); + + if (!dict_repeat(&s->dict, &s->lzma.len, s->lzma.rep0)) + return false; + } + } + + /* + * Having the range decoder always normalized when we are outside + * this function makes it easier to correctly handle end of the chunk. + */ + rc_normalize(&s->rc); + + return true; +} + +/* + * Reset the LZMA decoder and range decoder state. Dictionary is nore reset + * here, because LZMA state may be reset without resetting the dictionary. + */ +static void lzma_reset(struct xz_dec_lzma2 *s) +{ + uint16_t *probs; + size_t i; + + s->lzma.state = STATE_LIT_LIT; + s->lzma.rep0 = 0; + s->lzma.rep1 = 0; + s->lzma.rep2 = 0; + s->lzma.rep3 = 0; + + /* + * All probabilities are initialized to the same value. This hack + * makes the code smaller by avoiding a separate loop for each + * probability array. + * + * This could be optimized so that only that part of literal + * probabilities that are actually required. In the common case + * we would write 12 KiB less. + */ + probs = s->lzma.is_match[0]; + for (i = 0; i < PROBS_TOTAL; ++i) + probs[i] = RC_BIT_MODEL_TOTAL / 2; + + rc_reset(&s->rc); +} + +/* + * Decode and validate LZMA properties (lc/lp/pb) and calculate the bit masks + * from the decoded lp and pb values. On success, the LZMA decoder state is + * reset and true is returned. 
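+ * The byte encodes (pb * 5 + lp) * 9 + lc; for example the common value
+ * 93 decodes to pb = 2, lp = 0, lc = 3.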
+ */ +static bool lzma_props(struct xz_dec_lzma2 *s, uint8_t props) +{ + if (props > (4 * 5 + 4) * 9 + 8) + return false; + + s->lzma.pos_mask = 0; + while (props >= 9 * 5) { + props -= 9 * 5; + ++s->lzma.pos_mask; + } + + s->lzma.pos_mask = (1 << s->lzma.pos_mask) - 1; + + s->lzma.literal_pos_mask = 0; + while (props >= 9) { + props -= 9; + ++s->lzma.literal_pos_mask; + } + + s->lzma.lc = props; + + if (s->lzma.lc + s->lzma.literal_pos_mask > 4) + return false; + + s->lzma.literal_pos_mask = (1 << s->lzma.literal_pos_mask) - 1; + + lzma_reset(s); + + return true; +} + +/********* + * LZMA2 * + *********/ + +/* + * The LZMA decoder assumes that if the input limit (s->rc.in_limit) hasn't + * been exceeded, it is safe to read up to LZMA_IN_REQUIRED bytes. This + * wrapper function takes care of making the LZMA decoder's assumption safe. + * + * As long as there is plenty of input left to be decoded in the current LZMA + * chunk, we decode directly from the caller-supplied input buffer until + * there's LZMA_IN_REQUIRED bytes left. Those remaining bytes are copied into + * s->temp.buf, which (hopefully) gets filled on the next call to this + * function. We decode a few bytes from the temporary buffer so that we can + * continue decoding from the caller-supplied input buffer again. + */ +static bool lzma2_lzma(struct xz_dec_lzma2 *s, struct xz_buf *b) +{ + size_t in_avail; + uint32_t tmp; + + in_avail = b->in_size - b->in_pos; + if (s->temp.size > 0 || s->lzma2.compressed == 0) { + tmp = 2 * LZMA_IN_REQUIRED - s->temp.size; + if (tmp > s->lzma2.compressed - s->temp.size) + tmp = s->lzma2.compressed - s->temp.size; + if (tmp > in_avail) + tmp = in_avail; + + memcpy(s->temp.buf + s->temp.size, b->in + b->in_pos, tmp); + + if (s->temp.size + tmp == s->lzma2.compressed) { + memzero(s->temp.buf + s->temp.size + tmp, + sizeof(s->temp.buf) + - s->temp.size - tmp); + s->rc.in_limit = s->temp.size + tmp; + } else if (s->temp.size + tmp < LZMA_IN_REQUIRED) { + s->temp.size += tmp; + b->in_pos += tmp; + return true; + } else { + s->rc.in_limit = s->temp.size + tmp - LZMA_IN_REQUIRED; + } + + s->rc.in = s->temp.buf; + s->rc.in_pos = 0; + + if (!lzma_main(s) || s->rc.in_pos > s->temp.size + tmp) + return false; + + s->lzma2.compressed -= s->rc.in_pos; + + if (s->rc.in_pos < s->temp.size) { + s->temp.size -= s->rc.in_pos; + memmove(s->temp.buf, s->temp.buf + s->rc.in_pos, + s->temp.size); + return true; + } + + b->in_pos += s->rc.in_pos - s->temp.size; + s->temp.size = 0; + } + + in_avail = b->in_size - b->in_pos; + if (in_avail >= LZMA_IN_REQUIRED) { + s->rc.in = b->in; + s->rc.in_pos = b->in_pos; + + if (in_avail >= s->lzma2.compressed + LZMA_IN_REQUIRED) + s->rc.in_limit = b->in_pos + s->lzma2.compressed; + else + s->rc.in_limit = b->in_size - LZMA_IN_REQUIRED; + + if (!lzma_main(s)) + return false; + + in_avail = s->rc.in_pos - b->in_pos; + if (in_avail > s->lzma2.compressed) + return false; + + s->lzma2.compressed -= in_avail; + b->in_pos = s->rc.in_pos; + } + + in_avail = b->in_size - b->in_pos; + if (in_avail < LZMA_IN_REQUIRED) { + if (in_avail > s->lzma2.compressed) + in_avail = s->lzma2.compressed; + + memcpy(s->temp.buf, b->in + b->in_pos, in_avail); + s->temp.size = in_avail; + b->in_pos += in_avail; + } + + return true; +} + +/* + * Take care of the LZMA2 control layer, and forward the job of actual LZMA + * decoding or copying of uncompressed chunks to other functions. 
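+ *
+ * As a rough illustration of what the state machine below parses: a
+ * stream typically starts with a 0xE0..0xFF control byte (dictionary,
+ * properties, and state reset), two bytes of uncompressed size, two
+ * bytes of compressed size, and one properties byte, followed by the
+ * LZMA data; further compressed (0x80..0xFF) or uncompressed
+ * (0x01/0x02) chunks may follow, and a 0x00 control byte ends the
+ * stream.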
+ */ +XZ_EXTERN enum xz_ret xz_dec_lzma2_run(struct xz_dec_lzma2 *s, + struct xz_buf *b) +{ + uint32_t tmp; + + while (b->in_pos < b->in_size || s->lzma2.sequence == SEQ_LZMA_RUN) { + switch (s->lzma2.sequence) { + case SEQ_CONTROL: + /* + * LZMA2 control byte + * + * Exact values: + * 0x00 End marker + * 0x01 Dictionary reset followed by + * an uncompressed chunk + * 0x02 Uncompressed chunk (no dictionary reset) + * + * Highest three bits (s->control & 0xE0): + * 0xE0 Dictionary reset, new properties and state + * reset, followed by LZMA compressed chunk + * 0xC0 New properties and state reset, followed + * by LZMA compressed chunk (no dictionary + * reset) + * 0xA0 State reset using old properties, + * followed by LZMA compressed chunk (no + * dictionary reset) + * 0x80 LZMA chunk (no dictionary or state reset) + * + * For LZMA compressed chunks, the lowest five bits + * (s->control & 1F) are the highest bits of the + * uncompressed size (bits 16-20). + * + * A new LZMA2 stream must begin with a dictionary + * reset. The first LZMA chunk must set new + * properties and reset the LZMA state. + * + * Values that don't match anything described above + * are invalid and we return XZ_DATA_ERROR. + */ + tmp = b->in[b->in_pos++]; + + if (tmp >= 0xE0 || tmp == 0x01) { + s->lzma2.need_props = true; + s->lzma2.need_dict_reset = false; + dict_reset(&s->dict, b); + } else if (s->lzma2.need_dict_reset) { + return XZ_DATA_ERROR; + } + + if (tmp >= 0x80) { + s->lzma2.uncompressed = (tmp & 0x1F) << 16; + s->lzma2.sequence = SEQ_UNCOMPRESSED_1; + + if (tmp >= 0xC0) { + /* + * When there are new properties, + * state reset is done at + * SEQ_PROPERTIES. + */ + s->lzma2.need_props = false; + s->lzma2.next_sequence + = SEQ_PROPERTIES; + + } else if (s->lzma2.need_props) { + return XZ_DATA_ERROR; + + } else { + s->lzma2.next_sequence + = SEQ_LZMA_PREPARE; + if (tmp >= 0xA0) + lzma_reset(s); + } + } else { + if (tmp == 0x00) + return XZ_STREAM_END; + + if (tmp > 0x02) + return XZ_DATA_ERROR; + + s->lzma2.sequence = SEQ_COMPRESSED_0; + s->lzma2.next_sequence = SEQ_COPY; + } + + break; + + case SEQ_UNCOMPRESSED_1: + s->lzma2.uncompressed + += (uint32_t)b->in[b->in_pos++] << 8; + s->lzma2.sequence = SEQ_UNCOMPRESSED_2; + break; + + case SEQ_UNCOMPRESSED_2: + s->lzma2.uncompressed + += (uint32_t)b->in[b->in_pos++] + 1; + s->lzma2.sequence = SEQ_COMPRESSED_0; + break; + + case SEQ_COMPRESSED_0: + s->lzma2.compressed + = (uint32_t)b->in[b->in_pos++] << 8; + s->lzma2.sequence = SEQ_COMPRESSED_1; + break; + + case SEQ_COMPRESSED_1: + s->lzma2.compressed + += (uint32_t)b->in[b->in_pos++] + 1; + s->lzma2.sequence = s->lzma2.next_sequence; + break; + + case SEQ_PROPERTIES: + if (!lzma_props(s, b->in[b->in_pos++])) + return XZ_DATA_ERROR; + + s->lzma2.sequence = SEQ_LZMA_PREPARE; + + case SEQ_LZMA_PREPARE: + if (s->lzma2.compressed < RC_INIT_BYTES) + return XZ_DATA_ERROR; + + if (!rc_read_init(&s->rc, b)) + return XZ_OK; + + s->lzma2.compressed -= RC_INIT_BYTES; + s->lzma2.sequence = SEQ_LZMA_RUN; + + case SEQ_LZMA_RUN: + /* + * Set dictionary limit to indicate how much we want + * to be encoded at maximum. Decode new data into the + * dictionary. Flush the new data from dictionary to + * b->out. Check if we finished decoding this chunk. + * In case the dictionary got full but we didn't fill + * the output buffer yet, we may run this loop + * multiple times without changing s->lzma2.sequence. 
+ */ + dict_limit(&s->dict, min_t(size_t, + b->out_size - b->out_pos, + s->lzma2.uncompressed)); + if (!lzma2_lzma(s, b)) + return XZ_DATA_ERROR; + + s->lzma2.uncompressed -= dict_flush(&s->dict, b); + + if (s->lzma2.uncompressed == 0) { + if (s->lzma2.compressed > 0 || s->lzma.len > 0 + || !rc_is_finished(&s->rc)) + return XZ_DATA_ERROR; + + rc_reset(&s->rc); + s->lzma2.sequence = SEQ_CONTROL; + + } else if (b->out_pos == b->out_size + || (b->in_pos == b->in_size + && s->temp.size + < s->lzma2.compressed)) { + return XZ_OK; + } + + break; + + case SEQ_COPY: + dict_uncompressed(&s->dict, b, &s->lzma2.compressed); + if (s->lzma2.compressed > 0) + return XZ_OK; + + s->lzma2.sequence = SEQ_CONTROL; + break; + } + } + + return XZ_OK; +} + +XZ_EXTERN struct xz_dec_lzma2 *xz_dec_lzma2_create(enum xz_mode mode, + uint32_t dict_max) +{ + struct xz_dec_lzma2 *s = kmalloc(sizeof(*s), GFP_KERNEL); + if (s == NULL) + return NULL; + + s->dict.mode = mode; + s->dict.size_max = dict_max; + + if (DEC_IS_PREALLOC(mode)) { + s->dict.buf = vmalloc(dict_max); + if (s->dict.buf == NULL) { + kfree(s); + return NULL; + } + } else if (DEC_IS_DYNALLOC(mode)) { + s->dict.buf = NULL; + s->dict.allocated = 0; + } + + return s; +} + +XZ_EXTERN enum xz_ret xz_dec_lzma2_reset(struct xz_dec_lzma2 *s, uint8_t props) +{ + /* This limits dictionary size to 3 GiB to keep parsing simpler. */ + if (props > 39) + return XZ_OPTIONS_ERROR; + + s->dict.size = 2 + (props & 1); + s->dict.size <<= (props >> 1) + 11; + + if (DEC_IS_MULTI(s->dict.mode)) { + if (s->dict.size > s->dict.size_max) + return XZ_MEMLIMIT_ERROR; + + s->dict.end = s->dict.size; + + if (DEC_IS_DYNALLOC(s->dict.mode)) { + if (s->dict.allocated < s->dict.size) { + vfree(s->dict.buf); + s->dict.buf = vmalloc(s->dict.size); + if (s->dict.buf == NULL) { + s->dict.allocated = 0; + return XZ_MEM_ERROR; + } + } + } + } + + s->lzma.len = 0; + + s->lzma2.sequence = SEQ_CONTROL; + s->lzma2.need_dict_reset = true; + + s->temp.size = 0; + + return XZ_OK; +} + +XZ_EXTERN void xz_dec_lzma2_end(struct xz_dec_lzma2 *s) +{ + if (DEC_IS_MULTI(s->dict.mode)) + vfree(s->dict.buf); + + kfree(s); +} diff --git a/lib/xz/xz_dec_stream.c b/lib/xz/xz_dec_stream.c new file mode 100644 index 000000000000..ac809b1e64f7 --- /dev/null +++ b/lib/xz/xz_dec_stream.c @@ -0,0 +1,821 @@ +/* + * .xz Stream decoder + * + * Author: Lasse Collin <lasse.collin@tukaani.org> + * + * This file has been put into the public domain. + * You can do whatever you want with this file. + */ + +#include "xz_private.h" +#include "xz_stream.h" + +/* Hash used to validate the Index field */ +struct xz_dec_hash { + vli_type unpadded; + vli_type uncompressed; + uint32_t crc32; +}; + +struct xz_dec { + /* Position in dec_main() */ + enum { + SEQ_STREAM_HEADER, + SEQ_BLOCK_START, + SEQ_BLOCK_HEADER, + SEQ_BLOCK_UNCOMPRESS, + SEQ_BLOCK_PADDING, + SEQ_BLOCK_CHECK, + SEQ_INDEX, + SEQ_INDEX_PADDING, + SEQ_INDEX_CRC32, + SEQ_STREAM_FOOTER + } sequence; + + /* Position in variable-length integers and Check fields */ + uint32_t pos; + + /* Variable-length integer decoded by dec_vli() */ + vli_type vli; + + /* Saved in_pos and out_pos */ + size_t in_start; + size_t out_start; + + /* CRC32 value in Block or Index */ + uint32_t crc32; + + /* Type of the integrity check calculated from uncompressed data */ + enum xz_check check_type; + + /* Operation mode */ + enum xz_mode mode; + + /* + * True if the next call to xz_dec_run() is allowed to return + * XZ_BUF_ERROR. 
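+ * The flag gets set by a call that makes no progress, so XZ_BUF_ERROR is
+ * returned only on the second consecutive call that consumes no input
+ * and produces no output; see xz_dec_run() at the end of this file.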
+ */ + bool allow_buf_error; + + /* Information stored in Block Header */ + struct { + /* + * Value stored in the Compressed Size field, or + * VLI_UNKNOWN if Compressed Size is not present. + */ + vli_type compressed; + + /* + * Value stored in the Uncompressed Size field, or + * VLI_UNKNOWN if Uncompressed Size is not present. + */ + vli_type uncompressed; + + /* Size of the Block Header field */ + uint32_t size; + } block_header; + + /* Information collected when decoding Blocks */ + struct { + /* Observed compressed size of the current Block */ + vli_type compressed; + + /* Observed uncompressed size of the current Block */ + vli_type uncompressed; + + /* Number of Blocks decoded so far */ + vli_type count; + + /* + * Hash calculated from the Block sizes. This is used to + * validate the Index field. + */ + struct xz_dec_hash hash; + } block; + + /* Variables needed when verifying the Index field */ + struct { + /* Position in dec_index() */ + enum { + SEQ_INDEX_COUNT, + SEQ_INDEX_UNPADDED, + SEQ_INDEX_UNCOMPRESSED + } sequence; + + /* Size of the Index in bytes */ + vli_type size; + + /* Number of Records (matches block.count in valid files) */ + vli_type count; + + /* + * Hash calculated from the Records (matches block.hash in + * valid files). + */ + struct xz_dec_hash hash; + } index; + + /* + * Temporary buffer needed to hold Stream Header, Block Header, + * and Stream Footer. The Block Header is the biggest (1 KiB) + * so we reserve space according to that. buf[] has to be aligned + * to a multiple of four bytes; the size_t variables before it + * should guarantee this. + */ + struct { + size_t pos; + size_t size; + uint8_t buf[1024]; + } temp; + + struct xz_dec_lzma2 *lzma2; + +#ifdef XZ_DEC_BCJ + struct xz_dec_bcj *bcj; + bool bcj_active; +#endif +}; + +#ifdef XZ_DEC_ANY_CHECK +/* Sizes of the Check field with different Check IDs */ +static const uint8_t check_sizes[16] = { + 0, + 4, 4, 4, + 8, 8, 8, + 16, 16, 16, + 32, 32, 32, + 64, 64, 64 +}; +#endif + +/* + * Fill s->temp by copying data starting from b->in[b->in_pos]. Caller + * must have set s->temp.pos to indicate how much data we are supposed + * to copy into s->temp.buf. Return true once s->temp.pos has reached + * s->temp.size. + */ +static bool fill_temp(struct xz_dec *s, struct xz_buf *b) +{ + size_t copy_size = min_t(size_t, + b->in_size - b->in_pos, s->temp.size - s->temp.pos); + + memcpy(s->temp.buf + s->temp.pos, b->in + b->in_pos, copy_size); + b->in_pos += copy_size; + s->temp.pos += copy_size; + + if (s->temp.pos == s->temp.size) { + s->temp.pos = 0; + return true; + } + + return false; +} + +/* Decode a variable-length integer (little-endian base-128 encoding) */ +static enum xz_ret dec_vli(struct xz_dec *s, const uint8_t *in, + size_t *in_pos, size_t in_size) +{ + uint8_t byte; + + if (s->pos == 0) + s->vli = 0; + + while (*in_pos < in_size) { + byte = in[*in_pos]; + ++*in_pos; + + s->vli |= (vli_type)(byte & 0x7F) << s->pos; + + if ((byte & 0x80) == 0) { + /* Don't allow non-minimal encodings. */ + if (byte == 0 && s->pos != 0) + return XZ_DATA_ERROR; + + s->pos = 0; + return XZ_STREAM_END; + } + + s->pos += 7; + if (s->pos == 7 * VLI_BYTES_MAX) + return XZ_DATA_ERROR; + } + + return XZ_OK; +} + +/* + * Decode the Compressed Data field from a Block. Update and validate + * the observed compressed and uncompressed sizes of the Block so that + * they don't exceed the values possibly stored in the Block Header + * (validation assumes that no integer overflow occurs, since vli_type + * is normally uint64_t). 
Update the CRC32 if presence of the CRC32 + * field was indicated in Stream Header. + * + * Once the decoding is finished, validate that the observed sizes match + * the sizes possibly stored in the Block Header. Update the hash and + * Block count, which are later used to validate the Index field. + */ +static enum xz_ret dec_block(struct xz_dec *s, struct xz_buf *b) +{ + enum xz_ret ret; + + s->in_start = b->in_pos; + s->out_start = b->out_pos; + +#ifdef XZ_DEC_BCJ + if (s->bcj_active) + ret = xz_dec_bcj_run(s->bcj, s->lzma2, b); + else +#endif + ret = xz_dec_lzma2_run(s->lzma2, b); + + s->block.compressed += b->in_pos - s->in_start; + s->block.uncompressed += b->out_pos - s->out_start; + + /* + * There is no need to separately check for VLI_UNKNOWN, since + * the observed sizes are always smaller than VLI_UNKNOWN. + */ + if (s->block.compressed > s->block_header.compressed + || s->block.uncompressed + > s->block_header.uncompressed) + return XZ_DATA_ERROR; + + if (s->check_type == XZ_CHECK_CRC32) + s->crc32 = xz_crc32(b->out + s->out_start, + b->out_pos - s->out_start, s->crc32); + + if (ret == XZ_STREAM_END) { + if (s->block_header.compressed != VLI_UNKNOWN + && s->block_header.compressed + != s->block.compressed) + return XZ_DATA_ERROR; + + if (s->block_header.uncompressed != VLI_UNKNOWN + && s->block_header.uncompressed + != s->block.uncompressed) + return XZ_DATA_ERROR; + + s->block.hash.unpadded += s->block_header.size + + s->block.compressed; + +#ifdef XZ_DEC_ANY_CHECK + s->block.hash.unpadded += check_sizes[s->check_type]; +#else + if (s->check_type == XZ_CHECK_CRC32) + s->block.hash.unpadded += 4; +#endif + + s->block.hash.uncompressed += s->block.uncompressed; + s->block.hash.crc32 = xz_crc32( + (const uint8_t *)&s->block.hash, + sizeof(s->block.hash), s->block.hash.crc32); + + ++s->block.count; + } + + return ret; +} + +/* Update the Index size and the CRC32 value. */ +static void index_update(struct xz_dec *s, const struct xz_buf *b) +{ + size_t in_used = b->in_pos - s->in_start; + s->index.size += in_used; + s->crc32 = xz_crc32(b->in + s->in_start, in_used, s->crc32); +} + +/* + * Decode the Number of Records, Unpadded Size, and Uncompressed Size + * fields from the Index field. That is, Index Padding and CRC32 are not + * decoded by this function. + * + * This can return XZ_OK (more input needed), XZ_STREAM_END (everything + * successfully decoded), or XZ_DATA_ERROR (input is corrupt). + */ +static enum xz_ret dec_index(struct xz_dec *s, struct xz_buf *b) +{ + enum xz_ret ret; + + do { + ret = dec_vli(s, b->in, &b->in_pos, b->in_size); + if (ret != XZ_STREAM_END) { + index_update(s, b); + return ret; + } + + switch (s->index.sequence) { + case SEQ_INDEX_COUNT: + s->index.count = s->vli; + + /* + * Validate that the Number of Records field + * indicates the same number of Records as + * there were Blocks in the Stream. + */ + if (s->index.count != s->block.count) + return XZ_DATA_ERROR; + + s->index.sequence = SEQ_INDEX_UNPADDED; + break; + + case SEQ_INDEX_UNPADDED: + s->index.hash.unpadded += s->vli; + s->index.sequence = SEQ_INDEX_UNCOMPRESSED; + break; + + case SEQ_INDEX_UNCOMPRESSED: + s->index.hash.uncompressed += s->vli; + s->index.hash.crc32 = xz_crc32( + (const uint8_t *)&s->index.hash, + sizeof(s->index.hash), + s->index.hash.crc32); + --s->index.count; + s->index.sequence = SEQ_INDEX_UNPADDED; + break; + } + } while (s->index.count > 0); + + return XZ_STREAM_END; +} + +/* + * Validate that the next four input bytes match the value of s->crc32. 
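+ * The CRC32 is stored in the file least significant byte first, which is
+ * why the comparison below walks s->pos from 0 to 24 in steps of eight.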
+ * s->pos must be zero when starting to validate the first byte. + */ +static enum xz_ret crc32_validate(struct xz_dec *s, struct xz_buf *b) +{ + do { + if (b->in_pos == b->in_size) + return XZ_OK; + + if (((s->crc32 >> s->pos) & 0xFF) != b->in[b->in_pos++]) + return XZ_DATA_ERROR; + + s->pos += 8; + + } while (s->pos < 32); + + s->crc32 = 0; + s->pos = 0; + + return XZ_STREAM_END; +} + +#ifdef XZ_DEC_ANY_CHECK +/* + * Skip over the Check field when the Check ID is not supported. + * Returns true once the whole Check field has been skipped over. + */ +static bool check_skip(struct xz_dec *s, struct xz_buf *b) +{ + while (s->pos < check_sizes[s->check_type]) { + if (b->in_pos == b->in_size) + return false; + + ++b->in_pos; + ++s->pos; + } + + s->pos = 0; + + return true; +} +#endif + +/* Decode the Stream Header field (the first 12 bytes of the .xz Stream). */ +static enum xz_ret dec_stream_header(struct xz_dec *s) +{ + if (!memeq(s->temp.buf, HEADER_MAGIC, HEADER_MAGIC_SIZE)) + return XZ_FORMAT_ERROR; + + if (xz_crc32(s->temp.buf + HEADER_MAGIC_SIZE, 2, 0) + != get_le32(s->temp.buf + HEADER_MAGIC_SIZE + 2)) + return XZ_DATA_ERROR; + + if (s->temp.buf[HEADER_MAGIC_SIZE] != 0) + return XZ_OPTIONS_ERROR; + + /* + * Of integrity checks, we support only none (Check ID = 0) and + * CRC32 (Check ID = 1). However, if XZ_DEC_ANY_CHECK is defined, + * we will accept other check types too, but then the check won't + * be verified and a warning (XZ_UNSUPPORTED_CHECK) will be given. + */ + s->check_type = s->temp.buf[HEADER_MAGIC_SIZE + 1]; + +#ifdef XZ_DEC_ANY_CHECK + if (s->check_type > XZ_CHECK_MAX) + return XZ_OPTIONS_ERROR; + + if (s->check_type > XZ_CHECK_CRC32) + return XZ_UNSUPPORTED_CHECK; +#else + if (s->check_type > XZ_CHECK_CRC32) + return XZ_OPTIONS_ERROR; +#endif + + return XZ_OK; +} + +/* Decode the Stream Footer field (the last 12 bytes of the .xz Stream) */ +static enum xz_ret dec_stream_footer(struct xz_dec *s) +{ + if (!memeq(s->temp.buf + 10, FOOTER_MAGIC, FOOTER_MAGIC_SIZE)) + return XZ_DATA_ERROR; + + if (xz_crc32(s->temp.buf + 4, 6, 0) != get_le32(s->temp.buf)) + return XZ_DATA_ERROR; + + /* + * Validate Backward Size. Note that we never added the size of the + * Index CRC32 field to s->index.size, thus we use s->index.size / 4 + * instead of s->index.size / 4 - 1. + */ + if ((s->index.size >> 2) != get_le32(s->temp.buf + 4)) + return XZ_DATA_ERROR; + + if (s->temp.buf[8] != 0 || s->temp.buf[9] != s->check_type) + return XZ_DATA_ERROR; + + /* + * Use XZ_STREAM_END instead of XZ_OK to be more convenient + * for the caller. + */ + return XZ_STREAM_END; +} + +/* Decode the Block Header and initialize the filter chain. */ +static enum xz_ret dec_block_header(struct xz_dec *s) +{ + enum xz_ret ret; + + /* + * Validate the CRC32. We know that the temp buffer is at least + * eight bytes so this is safe. + */ + s->temp.size -= 4; + if (xz_crc32(s->temp.buf, s->temp.size, 0) + != get_le32(s->temp.buf + s->temp.size)) + return XZ_DATA_ERROR; + + s->temp.pos = 2; + + /* + * Catch unsupported Block Flags. We support only one or two filters + * in the chain, so we catch that with the same test. 
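+ * (In the Block Flags byte the two lowest bits hold the number of filters
+ * minus one, bits 0x3C are reserved and must be zero, 0x40 flags a
+ * Compressed Size field and 0x80 an Uncompressed Size field; that is what
+ * the masks below and the 0x40/0x80 tests further down check.)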
+ */ +#ifdef XZ_DEC_BCJ + if (s->temp.buf[1] & 0x3E) +#else + if (s->temp.buf[1] & 0x3F) +#endif + return XZ_OPTIONS_ERROR; + + /* Compressed Size */ + if (s->temp.buf[1] & 0x40) { + if (dec_vli(s, s->temp.buf, &s->temp.pos, s->temp.size) + != XZ_STREAM_END) + return XZ_DATA_ERROR; + + s->block_header.compressed = s->vli; + } else { + s->block_header.compressed = VLI_UNKNOWN; + } + + /* Uncompressed Size */ + if (s->temp.buf[1] & 0x80) { + if (dec_vli(s, s->temp.buf, &s->temp.pos, s->temp.size) + != XZ_STREAM_END) + return XZ_DATA_ERROR; + + s->block_header.uncompressed = s->vli; + } else { + s->block_header.uncompressed = VLI_UNKNOWN; + } + +#ifdef XZ_DEC_BCJ + /* If there are two filters, the first one must be a BCJ filter. */ + s->bcj_active = s->temp.buf[1] & 0x01; + if (s->bcj_active) { + if (s->temp.size - s->temp.pos < 2) + return XZ_OPTIONS_ERROR; + + ret = xz_dec_bcj_reset(s->bcj, s->temp.buf[s->temp.pos++]); + if (ret != XZ_OK) + return ret; + + /* + * We don't support custom start offset, + * so Size of Properties must be zero. + */ + if (s->temp.buf[s->temp.pos++] != 0x00) + return XZ_OPTIONS_ERROR; + } +#endif + + /* Valid Filter Flags always take at least two bytes. */ + if (s->temp.size - s->temp.pos < 2) + return XZ_DATA_ERROR; + + /* Filter ID = LZMA2 */ + if (s->temp.buf[s->temp.pos++] != 0x21) + return XZ_OPTIONS_ERROR; + + /* Size of Properties = 1-byte Filter Properties */ + if (s->temp.buf[s->temp.pos++] != 0x01) + return XZ_OPTIONS_ERROR; + + /* Filter Properties contains LZMA2 dictionary size. */ + if (s->temp.size - s->temp.pos < 1) + return XZ_DATA_ERROR; + + ret = xz_dec_lzma2_reset(s->lzma2, s->temp.buf[s->temp.pos++]); + if (ret != XZ_OK) + return ret; + + /* The rest must be Header Padding. */ + while (s->temp.pos < s->temp.size) + if (s->temp.buf[s->temp.pos++] != 0x00) + return XZ_OPTIONS_ERROR; + + s->temp.pos = 0; + s->block.compressed = 0; + s->block.uncompressed = 0; + + return XZ_OK; +} + +static enum xz_ret dec_main(struct xz_dec *s, struct xz_buf *b) +{ + enum xz_ret ret; + + /* + * Store the start position for the case when we are in the middle + * of the Index field. + */ + s->in_start = b->in_pos; + + while (true) { + switch (s->sequence) { + case SEQ_STREAM_HEADER: + /* + * Stream Header is copied to s->temp, and then + * decoded from there. This way if the caller + * gives us only little input at a time, we can + * still keep the Stream Header decoding code + * simple. Similar approach is used in many places + * in this file. + */ + if (!fill_temp(s, b)) + return XZ_OK; + + /* + * If dec_stream_header() returns + * XZ_UNSUPPORTED_CHECK, it is still possible + * to continue decoding if working in multi-call + * mode. Thus, update s->sequence before calling + * dec_stream_header(). + */ + s->sequence = SEQ_BLOCK_START; + + ret = dec_stream_header(s); + if (ret != XZ_OK) + return ret; + + case SEQ_BLOCK_START: + /* We need one byte of input to continue. */ + if (b->in_pos == b->in_size) + return XZ_OK; + + /* See if this is the beginning of the Index field. */ + if (b->in[b->in_pos] == 0) { + s->in_start = b->in_pos++; + s->sequence = SEQ_INDEX; + break; + } + + /* + * Calculate the size of the Block Header and + * prepare to decode it. 
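+ * The size byte encodes (real size / 4) - 1, so the Block Header is at
+ * most 1024 bytes and always fits in s->temp.buf.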
+ */ + s->block_header.size + = ((uint32_t)b->in[b->in_pos] + 1) * 4; + + s->temp.size = s->block_header.size; + s->temp.pos = 0; + s->sequence = SEQ_BLOCK_HEADER; + + case SEQ_BLOCK_HEADER: + if (!fill_temp(s, b)) + return XZ_OK; + + ret = dec_block_header(s); + if (ret != XZ_OK) + return ret; + + s->sequence = SEQ_BLOCK_UNCOMPRESS; + + case SEQ_BLOCK_UNCOMPRESS: + ret = dec_block(s, b); + if (ret != XZ_STREAM_END) + return ret; + + s->sequence = SEQ_BLOCK_PADDING; + + case SEQ_BLOCK_PADDING: + /* + * Size of Compressed Data + Block Padding + * must be a multiple of four. We don't need + * s->block.compressed for anything else + * anymore, so we use it here to test the size + * of the Block Padding field. + */ + while (s->block.compressed & 3) { + if (b->in_pos == b->in_size) + return XZ_OK; + + if (b->in[b->in_pos++] != 0) + return XZ_DATA_ERROR; + + ++s->block.compressed; + } + + s->sequence = SEQ_BLOCK_CHECK; + + case SEQ_BLOCK_CHECK: + if (s->check_type == XZ_CHECK_CRC32) { + ret = crc32_validate(s, b); + if (ret != XZ_STREAM_END) + return ret; + } +#ifdef XZ_DEC_ANY_CHECK + else if (!check_skip(s, b)) { + return XZ_OK; + } +#endif + + s->sequence = SEQ_BLOCK_START; + break; + + case SEQ_INDEX: + ret = dec_index(s, b); + if (ret != XZ_STREAM_END) + return ret; + + s->sequence = SEQ_INDEX_PADDING; + + case SEQ_INDEX_PADDING: + while ((s->index.size + (b->in_pos - s->in_start)) + & 3) { + if (b->in_pos == b->in_size) { + index_update(s, b); + return XZ_OK; + } + + if (b->in[b->in_pos++] != 0) + return XZ_DATA_ERROR; + } + + /* Finish the CRC32 value and Index size. */ + index_update(s, b); + + /* Compare the hashes to validate the Index field. */ + if (!memeq(&s->block.hash, &s->index.hash, + sizeof(s->block.hash))) + return XZ_DATA_ERROR; + + s->sequence = SEQ_INDEX_CRC32; + + case SEQ_INDEX_CRC32: + ret = crc32_validate(s, b); + if (ret != XZ_STREAM_END) + return ret; + + s->temp.size = STREAM_HEADER_SIZE; + s->sequence = SEQ_STREAM_FOOTER; + + case SEQ_STREAM_FOOTER: + if (!fill_temp(s, b)) + return XZ_OK; + + return dec_stream_footer(s); + } + } + + /* Never reached */ +} + +/* + * xz_dec_run() is a wrapper for dec_main() to handle some special cases in + * multi-call and single-call decoding. + * + * In multi-call mode, we must return XZ_BUF_ERROR when it seems clear that we + * are not going to make any progress anymore. This is to prevent the caller + * from calling us infinitely when the input file is truncated or otherwise + * corrupt. Since zlib-style API allows that the caller fills the input buffer + * only when the decoder doesn't produce any new output, we have to be careful + * to avoid returning XZ_BUF_ERROR too easily: XZ_BUF_ERROR is returned only + * after the second consecutive call to xz_dec_run() that makes no progress. + * + * In single-call mode, if we couldn't decode everything and no error + * occurred, either the input is truncated or the output buffer is too small. + * Since we know that the last input byte never produces any output, we know + * that if all the input was consumed and decoding wasn't finished, the file + * must be corrupt. Otherwise the output buffer has to be too small or the + * file is corrupt in a way that decoding it produces too big output. + * + * If single-call decoding fails, we reset b->in_pos and b->out_pos back to + * their original values. 
This is because with some filter chains there won't + * be any valid uncompressed data in the output buffer unless the decoding + * actually succeeds (that's the price to pay of using the output buffer as + * the workspace). + */ +XZ_EXTERN enum xz_ret xz_dec_run(struct xz_dec *s, struct xz_buf *b) +{ + size_t in_start; + size_t out_start; + enum xz_ret ret; + + if (DEC_IS_SINGLE(s->mode)) + xz_dec_reset(s); + + in_start = b->in_pos; + out_start = b->out_pos; + ret = dec_main(s, b); + + if (DEC_IS_SINGLE(s->mode)) { + if (ret == XZ_OK) + ret = b->in_pos == b->in_size + ? XZ_DATA_ERROR : XZ_BUF_ERROR; + + if (ret != XZ_STREAM_END) { + b->in_pos = in_start; + b->out_pos = out_start; + } + + } else if (ret == XZ_OK && in_start == b->in_pos + && out_start == b->out_pos) { + if (s->allow_buf_error) + ret = XZ_BUF_ERROR; + + s->allow_buf_error = true; + } else { + s->allow_buf_error = false; + } + + return ret; +} + +XZ_EXTERN struct xz_dec *xz_dec_init(enum xz_mode mode, uint32_t dict_max) +{ + struct xz_dec *s = kmalloc(sizeof(*s), GFP_KERNEL); + if (s == NULL) + return NULL; + + s->mode = mode; + +#ifdef XZ_DEC_BCJ + s->bcj = xz_dec_bcj_create(DEC_IS_SINGLE(mode)); + if (s->bcj == NULL) + goto error_bcj; +#endif + + s->lzma2 = xz_dec_lzma2_create(mode, dict_max); + if (s->lzma2 == NULL) + goto error_lzma2; + + xz_dec_reset(s); + return s; + +error_lzma2: +#ifdef XZ_DEC_BCJ + xz_dec_bcj_end(s->bcj); +error_bcj: +#endif + kfree(s); + return NULL; +} + +XZ_EXTERN void xz_dec_reset(struct xz_dec *s) +{ + s->sequence = SEQ_STREAM_HEADER; + s->allow_buf_error = false; + s->pos = 0; + s->crc32 = 0; + memzero(&s->block, sizeof(s->block)); + memzero(&s->index, sizeof(s->index)); + s->temp.pos = 0; + s->temp.size = STREAM_HEADER_SIZE; +} + +XZ_EXTERN void xz_dec_end(struct xz_dec *s) +{ + if (s != NULL) { + xz_dec_lzma2_end(s->lzma2); +#ifdef XZ_DEC_BCJ + xz_dec_bcj_end(s->bcj); +#endif + kfree(s); + } +} diff --git a/lib/xz/xz_dec_syms.c b/lib/xz/xz_dec_syms.c new file mode 100644 index 000000000000..32eb3c03aede --- /dev/null +++ b/lib/xz/xz_dec_syms.c @@ -0,0 +1,26 @@ +/* + * XZ decoder module information + * + * Author: Lasse Collin <lasse.collin@tukaani.org> + * + * This file has been put into the public domain. + * You can do whatever you want with this file. + */ + +#include <linux/module.h> +#include <linux/xz.h> + +EXPORT_SYMBOL(xz_dec_init); +EXPORT_SYMBOL(xz_dec_reset); +EXPORT_SYMBOL(xz_dec_run); +EXPORT_SYMBOL(xz_dec_end); + +MODULE_DESCRIPTION("XZ decompressor"); +MODULE_VERSION("1.0"); +MODULE_AUTHOR("Lasse Collin <lasse.collin@tukaani.org> and Igor Pavlov"); + +/* + * This code is in the public domain, but in Linux it's simplest to just + * say it's GPL and consider the authors as the copyright holders. + */ +MODULE_LICENSE("GPL"); diff --git a/lib/xz/xz_dec_test.c b/lib/xz/xz_dec_test.c new file mode 100644 index 000000000000..da28a19d6c98 --- /dev/null +++ b/lib/xz/xz_dec_test.c @@ -0,0 +1,220 @@ +/* + * XZ decoder tester + * + * Author: Lasse Collin <lasse.collin@tukaani.org> + * + * This file has been put into the public domain. + * You can do whatever you want with this file. + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/uaccess.h> +#include <linux/crc32.h> +#include <linux/xz.h> + +/* Maximum supported dictionary size */ +#define DICT_MAX (1 << 20) + +/* Device name to pass to register_chrdev(). 
*/ +#define DEVICE_NAME "xz_dec_test" + +/* Dynamically allocated device major number */ +static int device_major; + +/* + * We reuse the same decoder state, and thus can decode only one + * file at a time. + */ +static bool device_is_open; + +/* XZ decoder state */ +static struct xz_dec *state; + +/* + * Return value of xz_dec_run(). We need to avoid calling xz_dec_run() after + * it has returned XZ_STREAM_END, so we make this static. + */ +static enum xz_ret ret; + +/* + * Input and output buffers. The input buffer is used as a temporary safe + * place for the data coming from the userspace. + */ +static uint8_t buffer_in[1024]; +static uint8_t buffer_out[1024]; + +/* + * Structure to pass the input and output buffers to the XZ decoder. + * A few of the fields are never modified so we initialize them here. + */ +static struct xz_buf buffers = { + .in = buffer_in, + .out = buffer_out, + .out_size = sizeof(buffer_out) +}; + +/* + * CRC32 of uncompressed data. This is used to give the user a simple way + * to check that the decoder produces correct output. + */ +static uint32_t crc; + +static int xz_dec_test_open(struct inode *i, struct file *f) +{ + if (device_is_open) + return -EBUSY; + + device_is_open = true; + + xz_dec_reset(state); + ret = XZ_OK; + crc = 0xFFFFFFFF; + + buffers.in_pos = 0; + buffers.in_size = 0; + buffers.out_pos = 0; + + printk(KERN_INFO DEVICE_NAME ": opened\n"); + return 0; +} + +static int xz_dec_test_release(struct inode *i, struct file *f) +{ + device_is_open = false; + + if (ret == XZ_OK) + printk(KERN_INFO DEVICE_NAME ": input was truncated\n"); + + printk(KERN_INFO DEVICE_NAME ": closed\n"); + return 0; +} + +/* + * Decode the data given to us from the userspace. CRC32 of the uncompressed + * data is calculated and is printed at the end of successful decoding. The + * uncompressed data isn't stored anywhere for further use. + * + * The .xz file must have exactly one Stream and no Stream Padding. The data + * after the first Stream is considered to be garbage. 
+ */ +static ssize_t xz_dec_test_write(struct file *file, const char __user *buf, + size_t size, loff_t *pos) +{ + size_t remaining; + + if (ret != XZ_OK) { + if (size > 0) + printk(KERN_INFO DEVICE_NAME ": %zu bytes of " + "garbage at the end of the file\n", + size); + + return -ENOSPC; + } + + printk(KERN_INFO DEVICE_NAME ": decoding %zu bytes of input\n", + size); + + remaining = size; + while ((remaining > 0 || buffers.out_pos == buffers.out_size) + && ret == XZ_OK) { + if (buffers.in_pos == buffers.in_size) { + buffers.in_pos = 0; + buffers.in_size = min(remaining, sizeof(buffer_in)); + if (copy_from_user(buffer_in, buf, buffers.in_size)) + return -EFAULT; + + buf += buffers.in_size; + remaining -= buffers.in_size; + } + + buffers.out_pos = 0; + ret = xz_dec_run(state, &buffers); + crc = crc32(crc, buffer_out, buffers.out_pos); + } + + switch (ret) { + case XZ_OK: + printk(KERN_INFO DEVICE_NAME ": XZ_OK\n"); + return size; + + case XZ_STREAM_END: + printk(KERN_INFO DEVICE_NAME ": XZ_STREAM_END, " + "CRC32 = 0x%08X\n", ~crc); + return size - remaining - (buffers.in_size - buffers.in_pos); + + case XZ_MEMLIMIT_ERROR: + printk(KERN_INFO DEVICE_NAME ": XZ_MEMLIMIT_ERROR\n"); + break; + + case XZ_FORMAT_ERROR: + printk(KERN_INFO DEVICE_NAME ": XZ_FORMAT_ERROR\n"); + break; + + case XZ_OPTIONS_ERROR: + printk(KERN_INFO DEVICE_NAME ": XZ_OPTIONS_ERROR\n"); + break; + + case XZ_DATA_ERROR: + printk(KERN_INFO DEVICE_NAME ": XZ_DATA_ERROR\n"); + break; + + case XZ_BUF_ERROR: + printk(KERN_INFO DEVICE_NAME ": XZ_BUF_ERROR\n"); + break; + + default: + printk(KERN_INFO DEVICE_NAME ": Bug detected!\n"); + break; + } + + return -EIO; +} + +/* Allocate the XZ decoder state and register the character device. */ +static int __init xz_dec_test_init(void) +{ + static const struct file_operations fileops = { + .owner = THIS_MODULE, + .open = &xz_dec_test_open, + .release = &xz_dec_test_release, + .write = &xz_dec_test_write + }; + + state = xz_dec_init(XZ_PREALLOC, DICT_MAX); + if (state == NULL) + return -ENOMEM; + + device_major = register_chrdev(0, DEVICE_NAME, &fileops); + if (device_major < 0) { + xz_dec_end(state); + return device_major; + } + + printk(KERN_INFO DEVICE_NAME ": module loaded\n"); + printk(KERN_INFO DEVICE_NAME ": Create a device node with " + "'mknod " DEVICE_NAME " c %d 0' and write .xz files " + "to it.\n", device_major); + return 0; +} + +static void __exit xz_dec_test_exit(void) +{ + unregister_chrdev(device_major, DEVICE_NAME); + xz_dec_end(state); + printk(KERN_INFO DEVICE_NAME ": module unloaded\n"); +} + +module_init(xz_dec_test_init); +module_exit(xz_dec_test_exit); + +MODULE_DESCRIPTION("XZ decompressor tester"); +MODULE_VERSION("1.0"); +MODULE_AUTHOR("Lasse Collin <lasse.collin@tukaani.org>"); + +/* + * This code is in the public domain, but in Linux it's simplest to just + * say it's GPL and consider the authors as the copyright holders. + */ +MODULE_LICENSE("GPL"); diff --git a/lib/xz/xz_lzma2.h b/lib/xz/xz_lzma2.h new file mode 100644 index 000000000000..071d67bee9f5 --- /dev/null +++ b/lib/xz/xz_lzma2.h @@ -0,0 +1,204 @@ +/* + * LZMA2 definitions + * + * Authors: Lasse Collin <lasse.collin@tukaani.org> + * Igor Pavlov <http://7-zip.org/> + * + * This file has been put into the public domain. + * You can do whatever you want with this file. 
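The tester module above is driven from userspace simply by writing an .xz file to the device node created with mknod. The following is a hypothetical feeder program, not part of the patch: the node path ./xz_dec_test, the buffer size and the minimal error handling are assumptions. A short write indicates that the decoder reached XZ_STREAM_END or reported an error; the CRC32 of the uncompressed data is printed to the kernel log.

#include <fcntl.h>
#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>

int main(int argc, char **argv)
{
        char buf[4096];
        ssize_t n, w;
        int in, dev;

        if (argc != 2) {
                fprintf(stderr, "usage: %s file.xz\n", argv[0]);
                return 1;
        }

        in = open(argv[1], O_RDONLY);
        dev = open("./xz_dec_test", O_WRONLY);   /* node created with mknod */
        if (in < 0 || dev < 0) {
                perror("open");
                return 1;
        }

        while ((n = read(in, buf, sizeof(buf))) > 0) {
                w = write(dev, buf, n);
                if (w < n) {
                        /*
                         * Short write: the stream ended (see dmesg for the
                         * CRC32) or the decoder reported an error.
                         */
                        break;
                }
        }

        close(dev);     /* release() logs whether the input was truncated */
        close(in);
        return 0;
}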
+ */ + +#ifndef XZ_LZMA2_H +#define XZ_LZMA2_H + +/* Range coder constants */ +#define RC_SHIFT_BITS 8 +#define RC_TOP_BITS 24 +#define RC_TOP_VALUE (1 << RC_TOP_BITS) +#define RC_BIT_MODEL_TOTAL_BITS 11 +#define RC_BIT_MODEL_TOTAL (1 << RC_BIT_MODEL_TOTAL_BITS) +#define RC_MOVE_BITS 5 + +/* + * Maximum number of position states. A position state is the lowest pb + * number of bits of the current uncompressed offset. In some places there + * are different sets of probabilities for different position states. + */ +#define POS_STATES_MAX (1 << 4) + +/* + * This enum is used to track which LZMA symbols have occurred most recently + * and in which order. This information is used to predict the next symbol. + * + * Symbols: + * - Literal: One 8-bit byte + * - Match: Repeat a chunk of data at some distance + * - Long repeat: Multi-byte match at a recently seen distance + * - Short repeat: One-byte repeat at a recently seen distance + * + * The symbol names are in from STATE_oldest_older_previous. REP means + * either short or long repeated match, and NONLIT means any non-literal. + */ +enum lzma_state { + STATE_LIT_LIT, + STATE_MATCH_LIT_LIT, + STATE_REP_LIT_LIT, + STATE_SHORTREP_LIT_LIT, + STATE_MATCH_LIT, + STATE_REP_LIT, + STATE_SHORTREP_LIT, + STATE_LIT_MATCH, + STATE_LIT_LONGREP, + STATE_LIT_SHORTREP, + STATE_NONLIT_MATCH, + STATE_NONLIT_REP +}; + +/* Total number of states */ +#define STATES 12 + +/* The lowest 7 states indicate that the previous state was a literal. */ +#define LIT_STATES 7 + +/* Indicate that the latest symbol was a literal. */ +static inline void lzma_state_literal(enum lzma_state *state) +{ + if (*state <= STATE_SHORTREP_LIT_LIT) + *state = STATE_LIT_LIT; + else if (*state <= STATE_LIT_SHORTREP) + *state -= 3; + else + *state -= 6; +} + +/* Indicate that the latest symbol was a match. */ +static inline void lzma_state_match(enum lzma_state *state) +{ + *state = *state < LIT_STATES ? STATE_LIT_MATCH : STATE_NONLIT_MATCH; +} + +/* Indicate that the latest state was a long repeated match. */ +static inline void lzma_state_long_rep(enum lzma_state *state) +{ + *state = *state < LIT_STATES ? STATE_LIT_LONGREP : STATE_NONLIT_REP; +} + +/* Indicate that the latest symbol was a short match. */ +static inline void lzma_state_short_rep(enum lzma_state *state) +{ + *state = *state < LIT_STATES ? STATE_LIT_SHORTREP : STATE_NONLIT_REP; +} + +/* Test if the previous symbol was a literal. */ +static inline bool lzma_state_is_literal(enum lzma_state state) +{ + return state < LIT_STATES; +} + +/* Each literal coder is divided in three sections: + * - 0x001-0x0FF: Without match byte + * - 0x101-0x1FF: With match byte; match bit is 0 + * - 0x201-0x2FF: With match byte; match bit is 1 + * + * Match byte is used when the previous LZMA symbol was something else than + * a literal (that is, it was some kind of match). + */ +#define LITERAL_CODER_SIZE 0x300 + +/* Maximum number of literal coders */ +#define LITERAL_CODERS_MAX (1 << 4) + +/* Minimum length of a match is two bytes. */ +#define MATCH_LEN_MIN 2 + +/* Match length is encoded with 4, 5, or 10 bits. 
+ * + * Length Bits + * 2-9 4 = Choice=0 + 3 bits + * 10-17 5 = Choice=1 + Choice2=0 + 3 bits + * 18-273 10 = Choice=1 + Choice2=1 + 8 bits + */ +#define LEN_LOW_BITS 3 +#define LEN_LOW_SYMBOLS (1 << LEN_LOW_BITS) +#define LEN_MID_BITS 3 +#define LEN_MID_SYMBOLS (1 << LEN_MID_BITS) +#define LEN_HIGH_BITS 8 +#define LEN_HIGH_SYMBOLS (1 << LEN_HIGH_BITS) +#define LEN_SYMBOLS (LEN_LOW_SYMBOLS + LEN_MID_SYMBOLS + LEN_HIGH_SYMBOLS) + +/* + * Maximum length of a match is 273 which is a result of the encoding + * described above. + */ +#define MATCH_LEN_MAX (MATCH_LEN_MIN + LEN_SYMBOLS - 1) + +/* + * Different sets of probabilities are used for match distances that have + * very short match length: Lengths of 2, 3, and 4 bytes have a separate + * set of probabilities for each length. The matches with longer length + * use a shared set of probabilities. + */ +#define DIST_STATES 4 + +/* + * Get the index of the appropriate probability array for decoding + * the distance slot. + */ +static inline uint32_t lzma_get_dist_state(uint32_t len) +{ + return len < DIST_STATES + MATCH_LEN_MIN + ? len - MATCH_LEN_MIN : DIST_STATES - 1; +} + +/* + * The highest two bits of a 32-bit match distance are encoded using six bits. + * This six-bit value is called a distance slot. This way encoding a 32-bit + * value takes 6-36 bits, larger values taking more bits. + */ +#define DIST_SLOT_BITS 6 +#define DIST_SLOTS (1 << DIST_SLOT_BITS) + +/* Match distances up to 127 are fully encoded using probabilities. Since + * the highest two bits (distance slot) are always encoded using six bits, + * the distances 0-3 don't need any additional bits to encode, since the + * distance slot itself is the same as the actual distance. DIST_MODEL_START + * indicates the first distance slot where at least one additional bit is + * needed. + */ +#define DIST_MODEL_START 4 + +/* + * Match distances greater than 127 are encoded in three pieces: + * - distance slot: the highest two bits + * - direct bits: 2-26 bits below the highest two bits + * - alignment bits: four lowest bits + * + * Direct bits don't use any probabilities. + * + * The distance slot value of 14 is for distances 128-191. + */ +#define DIST_MODEL_END 14 + +/* Distance slots that indicate a distance <= 127. */ +#define FULL_DISTANCES_BITS (DIST_MODEL_END / 2) +#define FULL_DISTANCES (1 << FULL_DISTANCES_BITS) + +/* + * For match distances greater than 127, only the highest two bits and the + * lowest four bits (alignment) is encoded using probabilities. + */ +#define ALIGN_BITS 4 +#define ALIGN_SIZE (1 << ALIGN_BITS) +#define ALIGN_MASK (ALIGN_SIZE - 1) + +/* Total number of all probability variables */ +#define PROBS_TOTAL (1846 + LITERAL_CODERS_MAX * LITERAL_CODER_SIZE) + +/* + * LZMA remembers the four most recent match distances. Reusing these + * distances tends to take less space than re-encoding the actual + * distance value. + */ +#define REPS 4 + +#endif diff --git a/lib/xz/xz_private.h b/lib/xz/xz_private.h new file mode 100644 index 000000000000..a65633e06962 --- /dev/null +++ b/lib/xz/xz_private.h @@ -0,0 +1,156 @@ +/* + * Private includes and definitions + * + * Author: Lasse Collin <lasse.collin@tukaani.org> + * + * This file has been put into the public domain. + * You can do whatever you want with this file. + */ + +#ifndef XZ_PRIVATE_H +#define XZ_PRIVATE_H + +#ifdef __KERNEL__ +# include <linux/xz.h> +# include <asm/byteorder.h> +# include <asm/unaligned.h> + /* XZ_PREBOOT may be defined only via decompress_unxz.c. 
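The length-coder table in xz_lzma2.h above can be cross-checked with a few lines of standalone code. This is only an illustration: len_coder_bits() is a made-up helper that classifies a match length into the low/mid/high coder exactly as the 2-9, 10-17 and 18-273 rows describe.

#include <stdio.h>

#define MATCH_LEN_MIN   2
#define LEN_LOW_SYMBOLS (1 << 3)
#define LEN_MID_SYMBOLS (1 << 3)

/* Number of coded bits the length coder spends on a match of this length. */
static unsigned int len_coder_bits(unsigned int len)
{
        unsigned int sym = len - MATCH_LEN_MIN;

        if (sym < LEN_LOW_SYMBOLS)
                return 4;       /* Choice = 0, then 3 bits */
        if (sym < LEN_LOW_SYMBOLS + LEN_MID_SYMBOLS)
                return 5;       /* Choice = 1, Choice2 = 0, then 3 bits */
        return 10;              /* Choice = 1, Choice2 = 1, then 8 bits */
}

int main(void)
{
        /* Prints "4 5 10", matching the three rows of the table. */
        printf("%u %u %u\n", len_coder_bits(2), len_coder_bits(17),
               len_coder_bits(273));
        return 0;
}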
*/ +# ifndef XZ_PREBOOT +# include <linux/slab.h> +# include <linux/vmalloc.h> +# include <linux/string.h> +# ifdef CONFIG_XZ_DEC_X86 +# define XZ_DEC_X86 +# endif +# ifdef CONFIG_XZ_DEC_POWERPC +# define XZ_DEC_POWERPC +# endif +# ifdef CONFIG_XZ_DEC_IA64 +# define XZ_DEC_IA64 +# endif +# ifdef CONFIG_XZ_DEC_ARM +# define XZ_DEC_ARM +# endif +# ifdef CONFIG_XZ_DEC_ARMTHUMB +# define XZ_DEC_ARMTHUMB +# endif +# ifdef CONFIG_XZ_DEC_SPARC +# define XZ_DEC_SPARC +# endif +# define memeq(a, b, size) (memcmp(a, b, size) == 0) +# define memzero(buf, size) memset(buf, 0, size) +# endif +# define get_le32(p) le32_to_cpup((const uint32_t *)(p)) +#else + /* + * For userspace builds, use a separate header to define the required + * macros and functions. This makes it easier to adapt the code into + * different environments and avoids clutter in the Linux kernel tree. + */ +# include "xz_config.h" +#endif + +/* If no specific decoding mode is requested, enable support for all modes. */ +#if !defined(XZ_DEC_SINGLE) && !defined(XZ_DEC_PREALLOC) \ + && !defined(XZ_DEC_DYNALLOC) +# define XZ_DEC_SINGLE +# define XZ_DEC_PREALLOC +# define XZ_DEC_DYNALLOC +#endif + +/* + * The DEC_IS_foo(mode) macros are used in "if" statements. If only some + * of the supported modes are enabled, these macros will evaluate to true or + * false at compile time and thus allow the compiler to omit unneeded code. + */ +#ifdef XZ_DEC_SINGLE +# define DEC_IS_SINGLE(mode) ((mode) == XZ_SINGLE) +#else +# define DEC_IS_SINGLE(mode) (false) +#endif + +#ifdef XZ_DEC_PREALLOC +# define DEC_IS_PREALLOC(mode) ((mode) == XZ_PREALLOC) +#else +# define DEC_IS_PREALLOC(mode) (false) +#endif + +#ifdef XZ_DEC_DYNALLOC +# define DEC_IS_DYNALLOC(mode) ((mode) == XZ_DYNALLOC) +#else +# define DEC_IS_DYNALLOC(mode) (false) +#endif + +#if !defined(XZ_DEC_SINGLE) +# define DEC_IS_MULTI(mode) (true) +#elif defined(XZ_DEC_PREALLOC) || defined(XZ_DEC_DYNALLOC) +# define DEC_IS_MULTI(mode) ((mode) != XZ_SINGLE) +#else +# define DEC_IS_MULTI(mode) (false) +#endif + +/* + * If any of the BCJ filter decoders are wanted, define XZ_DEC_BCJ. + * XZ_DEC_BCJ is used to enable generic support for BCJ decoders. + */ +#ifndef XZ_DEC_BCJ +# if defined(XZ_DEC_X86) || defined(XZ_DEC_POWERPC) \ + || defined(XZ_DEC_IA64) || defined(XZ_DEC_ARM) \ + || defined(XZ_DEC_ARM) || defined(XZ_DEC_ARMTHUMB) \ + || defined(XZ_DEC_SPARC) +# define XZ_DEC_BCJ +# endif +#endif + +/* + * Allocate memory for LZMA2 decoder. xz_dec_lzma2_reset() must be used + * before calling xz_dec_lzma2_run(). + */ +XZ_EXTERN struct xz_dec_lzma2 *xz_dec_lzma2_create(enum xz_mode mode, + uint32_t dict_max); + +/* + * Decode the LZMA2 properties (one byte) and reset the decoder. Return + * XZ_OK on success, XZ_MEMLIMIT_ERROR if the preallocated dictionary is not + * big enough, and XZ_OPTIONS_ERROR if props indicates something that this + * decoder doesn't support. + */ +XZ_EXTERN enum xz_ret xz_dec_lzma2_reset(struct xz_dec_lzma2 *s, + uint8_t props); + +/* Decode raw LZMA2 stream from b->in to b->out. */ +XZ_EXTERN enum xz_ret xz_dec_lzma2_run(struct xz_dec_lzma2 *s, + struct xz_buf *b); + +/* Free the memory allocated for the LZMA2 decoder. */ +XZ_EXTERN void xz_dec_lzma2_end(struct xz_dec_lzma2 *s); + +#ifdef XZ_DEC_BCJ +/* + * Allocate memory for BCJ decoders. xz_dec_bcj_reset() must be used before + * calling xz_dec_bcj_run(). + */ +XZ_EXTERN struct xz_dec_bcj *xz_dec_bcj_create(bool single_call); + +/* + * Decode the Filter ID of a BCJ filter. 
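Taken together, the LZMA2 declarations above imply a simple call order: create the decoder once, reset it with the properties byte from the Block Header, run it, and free it. The sketch below is illustrative only: lzma2_only_decode is a made-up name, the XZ_PREALLOC mode and 1 MiB dictionary are assumptions, a real caller keeps calling xz_dec_lzma2_run() as more input arrives, and the file is assumed to be built next to xz_private.h so the declarations are visible.

#include "xz_private.h"

static enum xz_ret lzma2_only_decode(struct xz_buf *b, uint8_t props)
{
        struct xz_dec_lzma2 *lzma2;
        enum xz_ret ret;

        /* Preallocate a 1 MiB dictionary; larger props fail in reset(). */
        lzma2 = xz_dec_lzma2_create(XZ_PREALLOC, 1 << 20);
        if (lzma2 == NULL)
                return XZ_MEM_ERROR;

        ret = xz_dec_lzma2_reset(lzma2, props);
        if (ret == XZ_OK)
                ret = xz_dec_lzma2_run(lzma2, b);

        xz_dec_lzma2_end(lzma2);
        return ret;
}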
This implementation doesn't + * support custom start offsets, so no decoding of Filter Properties + * is needed. Returns XZ_OK if the given Filter ID is supported. + * Otherwise XZ_OPTIONS_ERROR is returned. + */ +XZ_EXTERN enum xz_ret xz_dec_bcj_reset(struct xz_dec_bcj *s, uint8_t id); + +/* + * Decode raw BCJ + LZMA2 stream. This must be used only if there actually is + * a BCJ filter in the chain. If the chain has only LZMA2, xz_dec_lzma2_run() + * must be called directly. + */ +XZ_EXTERN enum xz_ret xz_dec_bcj_run(struct xz_dec_bcj *s, + struct xz_dec_lzma2 *lzma2, + struct xz_buf *b); + +/* Free the memory allocated for the BCJ filters. */ +#define xz_dec_bcj_end(s) kfree(s) +#endif + +#endif diff --git a/lib/xz/xz_stream.h b/lib/xz/xz_stream.h new file mode 100644 index 000000000000..66cb5a7055ec --- /dev/null +++ b/lib/xz/xz_stream.h @@ -0,0 +1,62 @@ +/* + * Definitions for handling the .xz file format + * + * Author: Lasse Collin <lasse.collin@tukaani.org> + * + * This file has been put into the public domain. + * You can do whatever you want with this file. + */ + +#ifndef XZ_STREAM_H +#define XZ_STREAM_H + +#if defined(__KERNEL__) && !XZ_INTERNAL_CRC32 +# include <linux/crc32.h> +# undef crc32 +# define xz_crc32(buf, size, crc) \ + (~crc32_le(~(uint32_t)(crc), buf, size)) +#endif + +/* + * See the .xz file format specification at + * http://tukaani.org/xz/xz-file-format.txt + * to understand the container format. + */ + +#define STREAM_HEADER_SIZE 12 + +#define HEADER_MAGIC "\3757zXZ" +#define HEADER_MAGIC_SIZE 6 + +#define FOOTER_MAGIC "YZ" +#define FOOTER_MAGIC_SIZE 2 + +/* + * Variable-length integer can hold a 63-bit unsigned integer or a special + * value indicating that the value is unknown. + * + * Experimental: vli_type can be defined to uint32_t to save a few bytes + * in code size (no effect on speed). Doing so limits the uncompressed and + * compressed size of the file to less than 256 MiB and may also weaken + * error detection slightly. + */ +typedef uint64_t vli_type; + +#define VLI_MAX ((vli_type)-1 / 2) +#define VLI_UNKNOWN ((vli_type)-1) + +/* Maximum encoded size of a VLI */ +#define VLI_BYTES_MAX (sizeof(vli_type) * 8 / 7) + +/* Integrity Check types */ +enum xz_check { + XZ_CHECK_NONE = 0, + XZ_CHECK_CRC32 = 1, + XZ_CHECK_CRC64 = 4, + XZ_CHECK_SHA256 = 10 +}; + +/* Maximum possible Check ID */ +#define XZ_CHECK_MAX 15 + +#endif diff --git a/lib/zlib_inflate/inffast.c b/lib/zlib_inflate/inffast.c index 215447c55261..2c13ecc5bb2c 100644 --- a/lib/zlib_inflate/inffast.c +++ b/lib/zlib_inflate/inffast.c @@ -8,21 +8,6 @@ #include "inflate.h" #include "inffast.h" -/* Only do the unaligned "Faster" variant when - * CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS is set - * - * On powerpc, it won't be as we don't include autoconf.h - * automatically for the boot wrapper, which is intended as - * we run in an environment where we may not be able to deal - * with (even rare) alignment faults. In addition, we do not - * define __KERNEL__ for arch/powerpc/boot unlike x86 - */ - -#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS -#include <asm/unaligned.h> -#include <asm/byteorder.h> -#endif - #ifndef ASMINF /* Allow machine dependent optimization for post-increment or pre-increment. 
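The vli_type definition in xz_stream.h above refers to the variable-length integers of the .xz container: per the file-format specification linked there, each byte carries seven payload bits, least significant group first, with the high bit set while more bytes follow. The helper below is a simplified, whole-buffer sketch of that encoding (vli_decode is a made-up name and canonical-form checks are omitted); the real decoder in xz_dec_stream.c additionally has to cope with input split across calls.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

typedef uint64_t vli_type;

#define VLI_BYTES_MAX (sizeof(vli_type) * 8 / 7)

/*
 * Decode one variable-length integer from buf: 7 payload bits per byte,
 * least significant group first, high bit set while more bytes follow.
 * Returns true on success and stores the value and the bytes consumed.
 */
static bool vli_decode(const uint8_t *buf, size_t size,
                       vli_type *value, size_t *used)
{
        size_t i;

        *value = 0;
        for (i = 0; i < size && i < VLI_BYTES_MAX; ++i) {
                *value |= (vli_type)(buf[i] & 0x7F) << (7 * i);
                if ((buf[i] & 0x80) == 0) {
                        *used = i + 1;
                        return true;
                }
        }

        return false;   /* truncated or over-long encoding */
}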
@@ -36,14 +21,31 @@ - Pentium III (Anderson) - M68060 (Nikl) */ +union uu { + unsigned short us; + unsigned char b[2]; +}; + +/* Endian independed version */ +static inline unsigned short +get_unaligned16(const unsigned short *p) +{ + union uu mm; + unsigned char *b = (unsigned char *)p; + + mm.b[0] = b[0]; + mm.b[1] = b[1]; + return mm.us; +} + #ifdef POSTINC # define OFF 0 # define PUP(a) *(a)++ -# define UP_UNALIGNED(a) get_unaligned((a)++) +# define UP_UNALIGNED(a) get_unaligned16((a)++) #else # define OFF 1 # define PUP(a) *++(a) -# define UP_UNALIGNED(a) get_unaligned(++(a)) +# define UP_UNALIGNED(a) get_unaligned16(++(a)) #endif /* @@ -256,7 +258,6 @@ void inflate_fast(z_streamp strm, unsigned start) } } else { -#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS unsigned short *sout; unsigned long loops; @@ -274,22 +275,25 @@ void inflate_fast(z_streamp strm, unsigned start) sfrom = (unsigned short *)(from - OFF); loops = len >> 1; do +#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS + PUP(sout) = PUP(sfrom); +#else PUP(sout) = UP_UNALIGNED(sfrom); +#endif while (--loops); out = (unsigned char *)sout + OFF; from = (unsigned char *)sfrom + OFF; } else { /* dist == 1 or dist == 2 */ unsigned short pat16; - pat16 = *(sout-2+2*OFF); - if (dist == 1) -#if defined(__BIG_ENDIAN) - pat16 = (pat16 & 0xff) | ((pat16 & 0xff) << 8); -#elif defined(__LITTLE_ENDIAN) - pat16 = (pat16 & 0xff00) | ((pat16 & 0xff00) >> 8); -#else -#error __BIG_ENDIAN nor __LITTLE_ENDIAN is defined -#endif + pat16 = *(sout-1+OFF); + if (dist == 1) { + union uu mm; + /* copy one char pattern to both bytes */ + mm.us = pat16; + mm.b[0] = mm.b[1]; + pat16 = mm.us; + } loops = len >> 1; do PUP(sout) = pat16; @@ -298,20 +302,6 @@ void inflate_fast(z_streamp strm, unsigned start) } if (len & 1) PUP(out) = PUP(from); -#else /* CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS */ - from = out - dist; /* copy direct from output */ - do { /* minimum length is three */ - PUP(out) = PUP(from); - PUP(out) = PUP(from); - PUP(out) = PUP(from); - len -= 3; - } while (len > 2); - if (len) { - PUP(out) = PUP(from); - if (len > 1) - PUP(out) = PUP(from); - } -#endif /* !CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS */ } } else if ((op & 64) == 0) { /* 2nd level distance code */ |
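The inffast.c change above replaces the unaligned 16-bit loads with an endianness-independent union: bytes are copied one at a time, and for dist == 1 a single byte is duplicated into both halves of the 16-bit pattern, so no __BIG_ENDIAN/__LITTLE_ENDIAN selection is needed. The standalone restatement below is only an illustration; the helpers are simplified (taking a byte pointer directly) and the test values are made up.

#include <stdio.h>

union uu {
        unsigned short us;
        unsigned char b[2];
};

/* Read a 16-bit value one byte at a time, as get_unaligned16() above does. */
static unsigned short read_unaligned16(const unsigned char *p)
{
        union uu mm;

        mm.b[0] = p[0];
        mm.b[1] = p[1];
        return mm.us;
}

/* Duplicate one byte into both halves, as the dist == 1 case above does. */
static unsigned short repeat_byte16(unsigned short pat16)
{
        union uu mm;

        mm.us = pat16;
        mm.b[0] = mm.b[1];      /* copy one char pattern to both bytes */
        return mm.us;
}

int main(void)
{
        unsigned char src[2] = { 0x41, 0x42 };
        union uu out;

        out.us = repeat_byte16(read_unaligned16(src));

        /* Prints "42 42" on both big- and little-endian machines. */
        printf("%02x %02x\n", out.b[0], out.b[1]);
        return 0;
}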