diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2018-10-28 11:35:40 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2018-10-28 11:35:40 -0700 |
commit | dad4f140edaa3f6bb452b6913d41af1ffd672e45 (patch) | |
tree | 1c0ebdcdfcdfb4ec9af7810c5ad9bae0f791ff5c /include | |
parent | 69d5b97c597307773fe6c59775a5d5a88bb7e6b3 (diff) | |
parent | 3a08cd52c37c793ffc199f6fc2ecfc368e284b2d (diff) | |
download | talos-op-linux-dad4f140edaa3f6bb452b6913d41af1ffd672e45.tar.gz talos-op-linux-dad4f140edaa3f6bb452b6913d41af1ffd672e45.zip |
Merge branch 'xarray' of git://git.infradead.org/users/willy/linux-dax
Pull XArray conversion from Matthew Wilcox:
"The XArray provides an improved interface to the radix tree data
structure, providing locking as part of the API, specifying GFP flags
at allocation time, eliminating preloading, less re-walking the tree,
more efficient iterations and not exposing RCU-protected pointers to
its users.
This patch set
1. Introduces the XArray implementation
2. Converts the pagecache to use it
3. Converts memremap to use it
The page cache is the most complex and important user of the radix
tree, so converting it was most important. Converting the memremap
code removes the only other user of the multiorder code, which allows
us to remove the radix tree code that supported it.
I have 40+ followup patches to convert many other users of the radix
tree over to the XArray, but I'd like to get this part in first. The
other conversions haven't been in linux-next and aren't suitable for
applying yet, but you can see them in the xarray-conv branch if you're
interested"
* 'xarray' of git://git.infradead.org/users/willy/linux-dax: (90 commits)
radix tree: Remove multiorder support
radix tree test: Convert multiorder tests to XArray
radix tree tests: Convert item_delete_rcu to XArray
radix tree tests: Convert item_kill_tree to XArray
radix tree tests: Move item_insert_order
radix tree test suite: Remove multiorder benchmarking
radix tree test suite: Remove __item_insert
memremap: Convert to XArray
xarray: Add range store functionality
xarray: Move multiorder_check to in-kernel tests
xarray: Move multiorder_shrink to kernel tests
xarray: Move multiorder account test in-kernel
radix tree test suite: Convert iteration test to XArray
radix tree test suite: Convert tag_tagged_items to XArray
radix tree: Remove radix_tree_clear_tags
radix tree: Remove radix_tree_maybe_preload_order
radix tree: Remove split/join code
radix tree: Remove radix_tree_update_node_t
page cache: Finish XArray conversion
dax: Convert page fault handlers to XArray
...
Diffstat (limited to 'include')
-rw-r--r-- | include/linux/fs.h | 63 | ||||
-rw-r--r-- | include/linux/idr.h | 18 | ||||
-rw-r--r-- | include/linux/pagemap.h | 10 | ||||
-rw-r--r-- | include/linux/pagevec.h | 8 | ||||
-rw-r--r-- | include/linux/radix-tree.h | 178 | ||||
-rw-r--r-- | include/linux/swap.h | 22 | ||||
-rw-r--r-- | include/linux/swapops.h | 19 | ||||
-rw-r--r-- | include/linux/xarray.h | 1293 |
8 files changed, 1390 insertions, 221 deletions
diff --git a/include/linux/fs.h b/include/linux/fs.h index 897eae8faee1..771341470bce 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -403,24 +403,40 @@ int pagecache_write_end(struct file *, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata); +/** + * struct address_space - Contents of a cacheable, mappable object. + * @host: Owner, either the inode or the block_device. + * @i_pages: Cached pages. + * @gfp_mask: Memory allocation flags to use for allocating pages. + * @i_mmap_writable: Number of VM_SHARED mappings. + * @i_mmap: Tree of private and shared mappings. + * @i_mmap_rwsem: Protects @i_mmap and @i_mmap_writable. + * @nrpages: Number of page entries, protected by the i_pages lock. + * @nrexceptional: Shadow or DAX entries, protected by the i_pages lock. + * @writeback_index: Writeback starts here. + * @a_ops: Methods. + * @flags: Error bits and flags (AS_*). + * @wb_err: The most recent error which has occurred. + * @private_lock: For use by the owner of the address_space. + * @private_list: For use by the owner of the address_space. + * @private_data: For use by the owner of the address_space. + */ struct address_space { - struct inode *host; /* owner: inode, block_device */ - struct radix_tree_root i_pages; /* cached pages */ - atomic_t i_mmap_writable;/* count VM_SHARED mappings */ - struct rb_root_cached i_mmap; /* tree of private and shared mappings */ - struct rw_semaphore i_mmap_rwsem; /* protect tree, count, list */ - /* Protected by the i_pages lock */ - unsigned long nrpages; /* number of total pages */ - /* number of shadow or DAX exceptional entries */ + struct inode *host; + struct xarray i_pages; + gfp_t gfp_mask; + atomic_t i_mmap_writable; + struct rb_root_cached i_mmap; + struct rw_semaphore i_mmap_rwsem; + unsigned long nrpages; unsigned long nrexceptional; - pgoff_t writeback_index;/* writeback starts here */ - const struct address_space_operations *a_ops; /* methods */ - unsigned long flags; /* error bits */ - spinlock_t private_lock; /* for use by the address_space */ - gfp_t gfp_mask; /* implicit gfp mask for allocations */ - struct list_head private_list; /* for use by the address_space */ - void *private_data; /* ditto */ + pgoff_t writeback_index; + const struct address_space_operations *a_ops; + unsigned long flags; errseq_t wb_err; + spinlock_t private_lock; + struct list_head private_list; + void *private_data; } __attribute__((aligned(sizeof(long)))) __randomize_layout; /* * On most architectures that alignment is already the case; but @@ -467,15 +483,18 @@ struct block_device { struct mutex bd_fsfreeze_mutex; } __randomize_layout; +/* XArray tags, for tagging dirty and writeback pages in the pagecache. */ +#define PAGECACHE_TAG_DIRTY XA_MARK_0 +#define PAGECACHE_TAG_WRITEBACK XA_MARK_1 +#define PAGECACHE_TAG_TOWRITE XA_MARK_2 + /* - * Radix-tree tags, for tagging dirty and writeback pages within the pagecache - * radix trees + * Returns true if any of the pages in the mapping are marked with the tag. */ -#define PAGECACHE_TAG_DIRTY 0 -#define PAGECACHE_TAG_WRITEBACK 1 -#define PAGECACHE_TAG_TOWRITE 2 - -int mapping_tagged(struct address_space *mapping, int tag); +static inline bool mapping_tagged(struct address_space *mapping, xa_mark_t tag) +{ + return xa_marked(&mapping->i_pages, tag); +} static inline void i_mmap_lock_write(struct address_space *mapping) { diff --git a/include/linux/idr.h b/include/linux/idr.h index 3ec8628ce17f..60daf34b625d 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -214,8 +214,7 @@ static inline void idr_preload_end(void) ++id, (entry) = idr_get_next((idr), &(id))) /* - * IDA - IDR based id allocator, use when translation from id to - * pointer isn't necessary. + * IDA - ID Allocator, use when translation from id to pointer isn't necessary. */ #define IDA_CHUNK_SIZE 128 /* 128 bytes per chunk */ #define IDA_BITMAP_LONGS (IDA_CHUNK_SIZE / sizeof(long)) @@ -225,14 +224,14 @@ struct ida_bitmap { unsigned long bitmap[IDA_BITMAP_LONGS]; }; -DECLARE_PER_CPU(struct ida_bitmap *, ida_bitmap); - struct ida { - struct radix_tree_root ida_rt; + struct xarray xa; }; +#define IDA_INIT_FLAGS (XA_FLAGS_LOCK_IRQ | XA_FLAGS_ALLOC) + #define IDA_INIT(name) { \ - .ida_rt = RADIX_TREE_INIT(name, IDR_RT_MARKER | GFP_NOWAIT), \ + .xa = XARRAY_INIT(name, IDA_INIT_FLAGS) \ } #define DEFINE_IDA(name) struct ida name = IDA_INIT(name) @@ -292,7 +291,7 @@ static inline int ida_alloc_max(struct ida *ida, unsigned int max, gfp_t gfp) static inline void ida_init(struct ida *ida) { - INIT_RADIX_TREE(&ida->ida_rt, IDR_RT_MARKER | GFP_NOWAIT); + xa_init_flags(&ida->xa, IDA_INIT_FLAGS); } #define ida_simple_get(ida, start, end, gfp) \ @@ -301,9 +300,6 @@ static inline void ida_init(struct ida *ida) static inline bool ida_is_empty(const struct ida *ida) { - return radix_tree_empty(&ida->ida_rt); + return xa_empty(&ida->xa); } - -/* in lib/radix-tree.c */ -int ida_pre_get(struct ida *ida, gfp_t gfp_mask); #endif /* __IDR_H__ */ diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index b1bd2186e6d2..226f96f0dee0 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -241,9 +241,9 @@ static inline gfp_t readahead_gfp_mask(struct address_space *x) typedef int filler_t(void *, struct page *); -pgoff_t page_cache_next_hole(struct address_space *mapping, +pgoff_t page_cache_next_miss(struct address_space *mapping, pgoff_t index, unsigned long max_scan); -pgoff_t page_cache_prev_hole(struct address_space *mapping, +pgoff_t page_cache_prev_miss(struct address_space *mapping, pgoff_t index, unsigned long max_scan); #define FGP_ACCESSED 0x00000001 @@ -363,17 +363,17 @@ static inline unsigned find_get_pages(struct address_space *mapping, unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start, unsigned int nr_pages, struct page **pages); unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, - pgoff_t end, int tag, unsigned int nr_pages, + pgoff_t end, xa_mark_t tag, unsigned int nr_pages, struct page **pages); static inline unsigned find_get_pages_tag(struct address_space *mapping, - pgoff_t *index, int tag, unsigned int nr_pages, + pgoff_t *index, xa_mark_t tag, unsigned int nr_pages, struct page **pages) { return find_get_pages_range_tag(mapping, index, (pgoff_t)-1, tag, nr_pages, pages); } unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start, - int tag, unsigned int nr_entries, + xa_mark_t tag, unsigned int nr_entries, struct page **entries, pgoff_t *indices); struct page *grab_cache_page_write_begin(struct address_space *mapping, diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index 6dc456ac6136..081d934eda64 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -9,6 +9,8 @@ #ifndef _LINUX_PAGEVEC_H #define _LINUX_PAGEVEC_H +#include <linux/xarray.h> + /* 15 pointers + header align the pagevec structure to a power of two */ #define PAGEVEC_SIZE 15 @@ -40,12 +42,12 @@ static inline unsigned pagevec_lookup(struct pagevec *pvec, unsigned pagevec_lookup_range_tag(struct pagevec *pvec, struct address_space *mapping, pgoff_t *index, pgoff_t end, - int tag); + xa_mark_t tag); unsigned pagevec_lookup_range_nr_tag(struct pagevec *pvec, struct address_space *mapping, pgoff_t *index, pgoff_t end, - int tag, unsigned max_pages); + xa_mark_t tag, unsigned max_pages); static inline unsigned pagevec_lookup_tag(struct pagevec *pvec, - struct address_space *mapping, pgoff_t *index, int tag) + struct address_space *mapping, pgoff_t *index, xa_mark_t tag) { return pagevec_lookup_range_tag(pvec, mapping, index, (pgoff_t)-1, tag); } diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 34149e8b5f73..06c4c7a6c09c 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -28,34 +28,30 @@ #include <linux/rcupdate.h> #include <linux/spinlock.h> #include <linux/types.h> +#include <linux/xarray.h> + +/* Keep unconverted code working */ +#define radix_tree_root xarray +#define radix_tree_node xa_node /* * The bottom two bits of the slot determine how the remaining bits in the * slot are interpreted: * * 00 - data pointer - * 01 - internal entry - * 10 - exceptional entry - * 11 - this bit combination is currently unused/reserved + * 10 - internal entry + * x1 - value entry * * The internal entry may be a pointer to the next level in the tree, a * sibling entry, or an indicator that the entry in this slot has been moved * to another location in the tree and the lookup should be restarted. While * NULL fits the 'data pointer' pattern, it means that there is no entry in * the tree for this index (no matter what level of the tree it is found at). - * This means that you cannot store NULL in the tree as a value for the index. + * This means that storing a NULL entry in the tree is the same as deleting + * the entry from the tree. */ #define RADIX_TREE_ENTRY_MASK 3UL -#define RADIX_TREE_INTERNAL_NODE 1UL - -/* - * Most users of the radix tree store pointers but shmem/tmpfs stores swap - * entries in the same tree. They are marked as exceptional entries to - * distinguish them from pointers to struct page. - * EXCEPTIONAL_ENTRY tests the bit, EXCEPTIONAL_SHIFT shifts content past it. - */ -#define RADIX_TREE_EXCEPTIONAL_ENTRY 2 -#define RADIX_TREE_EXCEPTIONAL_SHIFT 2 +#define RADIX_TREE_INTERNAL_NODE 2UL static inline bool radix_tree_is_internal_node(void *ptr) { @@ -65,75 +61,32 @@ static inline bool radix_tree_is_internal_node(void *ptr) /*** radix-tree API starts here ***/ -#define RADIX_TREE_MAX_TAGS 3 - -#ifndef RADIX_TREE_MAP_SHIFT -#define RADIX_TREE_MAP_SHIFT (CONFIG_BASE_SMALL ? 4 : 6) -#endif - +#define RADIX_TREE_MAP_SHIFT XA_CHUNK_SHIFT #define RADIX_TREE_MAP_SIZE (1UL << RADIX_TREE_MAP_SHIFT) #define RADIX_TREE_MAP_MASK (RADIX_TREE_MAP_SIZE-1) -#define RADIX_TREE_TAG_LONGS \ - ((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG) +#define RADIX_TREE_MAX_TAGS XA_MAX_MARKS +#define RADIX_TREE_TAG_LONGS XA_MARK_LONGS #define RADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long)) #define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \ RADIX_TREE_MAP_SHIFT)) -/* - * @count is the count of every non-NULL element in the ->slots array - * whether that is an exceptional entry, a retry entry, a user pointer, - * a sibling entry or a pointer to the next level of the tree. - * @exceptional is the count of every element in ->slots which is - * either radix_tree_exceptional_entry() or is a sibling entry for an - * exceptional entry. - */ -struct radix_tree_node { - unsigned char shift; /* Bits remaining in each slot */ - unsigned char offset; /* Slot offset in parent */ - unsigned char count; /* Total entry count */ - unsigned char exceptional; /* Exceptional entry count */ - struct radix_tree_node *parent; /* Used when ascending tree */ - struct radix_tree_root *root; /* The tree we belong to */ - union { - struct list_head private_list; /* For tree user */ - struct rcu_head rcu_head; /* Used when freeing node */ - }; - void __rcu *slots[RADIX_TREE_MAP_SIZE]; - unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS]; -}; - -/* The IDR tag is stored in the low bits of the GFP flags */ +/* The IDR tag is stored in the low bits of xa_flags */ #define ROOT_IS_IDR ((__force gfp_t)4) -/* The top bits of gfp_mask are used to store the root tags */ +/* The top bits of xa_flags are used to store the root tags */ #define ROOT_TAG_SHIFT (__GFP_BITS_SHIFT) -struct radix_tree_root { - spinlock_t xa_lock; - gfp_t gfp_mask; - struct radix_tree_node __rcu *rnode; -}; - -#define RADIX_TREE_INIT(name, mask) { \ - .xa_lock = __SPIN_LOCK_UNLOCKED(name.xa_lock), \ - .gfp_mask = (mask), \ - .rnode = NULL, \ -} +#define RADIX_TREE_INIT(name, mask) XARRAY_INIT(name, mask) #define RADIX_TREE(name, mask) \ struct radix_tree_root name = RADIX_TREE_INIT(name, mask) -#define INIT_RADIX_TREE(root, mask) \ -do { \ - spin_lock_init(&(root)->xa_lock); \ - (root)->gfp_mask = (mask); \ - (root)->rnode = NULL; \ -} while (0) +#define INIT_RADIX_TREE(root, mask) xa_init_flags(root, mask) static inline bool radix_tree_empty(const struct radix_tree_root *root) { - return root->rnode == NULL; + return root->xa_head == NULL; } /** @@ -143,7 +96,6 @@ static inline bool radix_tree_empty(const struct radix_tree_root *root) * @next_index: one beyond the last index for this chunk * @tags: bit-mask for tag-iterating * @node: node that contains current slot - * @shift: shift for the node that holds our slots * * This radix tree iterator works in terms of "chunks" of slots. A chunk is a * subinterval of slots contained within one radix tree leaf node. It is @@ -157,20 +109,8 @@ struct radix_tree_iter { unsigned long next_index; unsigned long tags; struct radix_tree_node *node; -#ifdef CONFIG_RADIX_TREE_MULTIORDER - unsigned int shift; -#endif }; -static inline unsigned int iter_shift(const struct radix_tree_iter *iter) -{ -#ifdef CONFIG_RADIX_TREE_MULTIORDER - return iter->shift; -#else - return 0; -#endif -} - /** * Radix-tree synchronization * @@ -194,12 +134,11 @@ static inline unsigned int iter_shift(const struct radix_tree_iter *iter) * radix_tree_lookup_slot * radix_tree_tag_get * radix_tree_gang_lookup - * radix_tree_gang_lookup_slot * radix_tree_gang_lookup_tag * radix_tree_gang_lookup_tag_slot * radix_tree_tagged * - * The first 8 functions are able to be called locklessly, using RCU. The + * The first 7 functions are able to be called locklessly, using RCU. The * caller must ensure calls to these functions are made within rcu_read_lock() * regions. Other readers (lock-free or otherwise) and modifications may be * running concurrently. @@ -269,17 +208,6 @@ static inline int radix_tree_deref_retry(void *arg) } /** - * radix_tree_exceptional_entry - radix_tree_deref_slot gave exceptional entry? - * @arg: value returned by radix_tree_deref_slot - * Returns: 0 if well-aligned pointer, non-0 if exceptional entry. - */ -static inline int radix_tree_exceptional_entry(void *arg) -{ - /* Not unlikely because radix_tree_exception often tested first */ - return (unsigned long)arg & RADIX_TREE_EXCEPTIONAL_ENTRY; -} - -/** * radix_tree_exception - radix_tree_deref_slot returned either exception? * @arg: value returned by radix_tree_deref_slot * Returns: 0 if well-aligned pointer, non-0 if either kind of exception. @@ -289,47 +217,28 @@ static inline int radix_tree_exception(void *arg) return unlikely((unsigned long)arg & RADIX_TREE_ENTRY_MASK); } -int __radix_tree_create(struct radix_tree_root *, unsigned long index, - unsigned order, struct radix_tree_node **nodep, - void __rcu ***slotp); -int __radix_tree_insert(struct radix_tree_root *, unsigned long index, - unsigned order, void *); -static inline int radix_tree_insert(struct radix_tree_root *root, - unsigned long index, void *entry) -{ - return __radix_tree_insert(root, index, 0, entry); -} +int radix_tree_insert(struct radix_tree_root *, unsigned long index, + void *); void *__radix_tree_lookup(const struct radix_tree_root *, unsigned long index, struct radix_tree_node **nodep, void __rcu ***slotp); void *radix_tree_lookup(const struct radix_tree_root *, unsigned long); void __rcu **radix_tree_lookup_slot(const struct radix_tree_root *, unsigned long index); -typedef void (*radix_tree_update_node_t)(struct radix_tree_node *); void __radix_tree_replace(struct radix_tree_root *, struct radix_tree_node *, - void __rcu **slot, void *entry, - radix_tree_update_node_t update_node); + void __rcu **slot, void *entry); void radix_tree_iter_replace(struct radix_tree_root *, const struct radix_tree_iter *, void __rcu **slot, void *entry); void radix_tree_replace_slot(struct radix_tree_root *, void __rcu **slot, void *entry); -void __radix_tree_delete_node(struct radix_tree_root *, - struct radix_tree_node *, - radix_tree_update_node_t update_node); void radix_tree_iter_delete(struct radix_tree_root *, struct radix_tree_iter *iter, void __rcu **slot); void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *); void *radix_tree_delete(struct radix_tree_root *, unsigned long); -void radix_tree_clear_tags(struct radix_tree_root *, struct radix_tree_node *, - void __rcu **slot); unsigned int radix_tree_gang_lookup(const struct radix_tree_root *, void **results, unsigned long first_index, unsigned int max_items); -unsigned int radix_tree_gang_lookup_slot(const struct radix_tree_root *, - void __rcu ***results, unsigned long *indices, - unsigned long first_index, unsigned int max_items); int radix_tree_preload(gfp_t gfp_mask); int radix_tree_maybe_preload(gfp_t gfp_mask); -int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order); void radix_tree_init(void); void *radix_tree_tag_set(struct radix_tree_root *, unsigned long index, unsigned int tag); @@ -337,8 +246,6 @@ void *radix_tree_tag_clear(struct radix_tree_root *, unsigned long index, unsigned int tag); int radix_tree_tag_get(const struct radix_tree_root *, unsigned long index, unsigned int tag); -void radix_tree_iter_tag_set(struct radix_tree_root *, - const struct radix_tree_iter *iter, unsigned int tag); void radix_tree_iter_tag_clear(struct radix_tree_root *, const struct radix_tree_iter *iter, unsigned int tag); unsigned int radix_tree_gang_lookup_tag(const struct radix_tree_root *, @@ -354,12 +261,6 @@ static inline void radix_tree_preload_end(void) preempt_enable(); } -int radix_tree_split_preload(unsigned old_order, unsigned new_order, gfp_t); -int radix_tree_split(struct radix_tree_root *, unsigned long index, - unsigned new_order); -int radix_tree_join(struct radix_tree_root *, unsigned long index, - unsigned new_order, void *); - void __rcu **idr_get_free(struct radix_tree_root *root, struct radix_tree_iter *iter, gfp_t gfp, unsigned long max); @@ -465,7 +366,7 @@ void __rcu **radix_tree_iter_retry(struct radix_tree_iter *iter) static inline unsigned long __radix_tree_iter_add(struct radix_tree_iter *iter, unsigned long slots) { - return iter->index + (slots << iter_shift(iter)); + return iter->index + slots; } /** @@ -490,21 +391,9 @@ void __rcu **__must_check radix_tree_iter_resume(void __rcu **slot, static __always_inline long radix_tree_chunk_size(struct radix_tree_iter *iter) { - return (iter->next_index - iter->index) >> iter_shift(iter); + return iter->next_index - iter->index; } -#ifdef CONFIG_RADIX_TREE_MULTIORDER -void __rcu **__radix_tree_next_slot(void __rcu **slot, - struct radix_tree_iter *iter, unsigned flags); -#else -/* Can't happen without sibling entries, but the compiler can't tell that */ -static inline void __rcu **__radix_tree_next_slot(void __rcu **slot, - struct radix_tree_iter *iter, unsigned flags) -{ - return slot; -} -#endif - /** * radix_tree_next_slot - find next slot in chunk * @@ -563,8 +452,6 @@ static __always_inline void __rcu **radix_tree_next_slot(void __rcu **slot, return NULL; found: - if (unlikely(radix_tree_is_internal_node(rcu_dereference_raw(*slot)))) - return __radix_tree_next_slot(slot, iter, flags); return slot; } @@ -584,23 +471,6 @@ static __always_inline void __rcu **radix_tree_next_slot(void __rcu **slot, slot = radix_tree_next_slot(slot, iter, 0)) /** - * radix_tree_for_each_contig - iterate over contiguous slots - * - * @slot: the void** variable for pointer to slot - * @root: the struct radix_tree_root pointer - * @iter: the struct radix_tree_iter pointer - * @start: iteration starting index - * - * @slot points to radix tree slot, @iter->index contains its index. - */ -#define radix_tree_for_each_contig(slot, root, iter, start) \ - for (slot = radix_tree_iter_init(iter, start) ; \ - slot || (slot = radix_tree_next_chunk(root, iter, \ - RADIX_TREE_ITER_CONTIG)) ; \ - slot = radix_tree_next_slot(slot, iter, \ - RADIX_TREE_ITER_CONTIG)) - -/** * radix_tree_for_each_tagged - iterate over tagged slots * * @slot: the void** variable for pointer to slot diff --git a/include/linux/swap.h b/include/linux/swap.h index 38195f5c96b1..d8a07a4f171d 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -300,17 +300,12 @@ void *workingset_eviction(struct address_space *mapping, struct page *page); void workingset_refault(struct page *page, void *shadow); void workingset_activation(struct page *page); -/* Do not use directly, use workingset_lookup_update */ -void workingset_update_node(struct radix_tree_node *node); - -/* Returns workingset_update_node() if the mapping has shadow entries. */ -#define workingset_lookup_update(mapping) \ -({ \ - radix_tree_update_node_t __helper = workingset_update_node; \ - if (dax_mapping(mapping) || shmem_mapping(mapping)) \ - __helper = NULL; \ - __helper; \ -}) +/* Only track the nodes of mappings with shadow entries */ +void workingset_update_node(struct xa_node *node); +#define mapping_set_update(xas, mapping) do { \ + if (!dax_mapping(mapping) && !shmem_mapping(mapping)) \ + xas_set_update(xas, workingset_update_node); \ +} while (0) /* linux/mm/page_alloc.c */ extern unsigned long totalram_pages; @@ -409,7 +404,7 @@ extern void show_swap_cache_info(void); extern int add_to_swap(struct page *page); extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t); extern int __add_to_swap_cache(struct page *page, swp_entry_t entry); -extern void __delete_from_swap_cache(struct page *); +extern void __delete_from_swap_cache(struct page *, swp_entry_t entry); extern void delete_from_swap_cache(struct page *); extern void free_page_and_swap_cache(struct page *); extern void free_pages_and_swap_cache(struct page **, int); @@ -563,7 +558,8 @@ static inline int add_to_swap_cache(struct page *page, swp_entry_t entry, return -1; } -static inline void __delete_from_swap_cache(struct page *page) +static inline void __delete_from_swap_cache(struct page *page, + swp_entry_t entry) { } diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 22af9d8a84ae..4d961668e5fc 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -18,9 +18,8 @@ * * swp_entry_t's are *never* stored anywhere in their arch-dependent format. */ -#define SWP_TYPE_SHIFT(e) ((sizeof(e.val) * 8) - \ - (MAX_SWAPFILES_SHIFT + RADIX_TREE_EXCEPTIONAL_SHIFT)) -#define SWP_OFFSET_MASK(e) ((1UL << SWP_TYPE_SHIFT(e)) - 1) +#define SWP_TYPE_SHIFT (BITS_PER_XA_VALUE - MAX_SWAPFILES_SHIFT) +#define SWP_OFFSET_MASK ((1UL << SWP_TYPE_SHIFT) - 1) /* * Store a type+offset into a swp_entry_t in an arch-independent format @@ -29,8 +28,7 @@ static inline swp_entry_t swp_entry(unsigned long type, pgoff_t offset) { swp_entry_t ret; - ret.val = (type << SWP_TYPE_SHIFT(ret)) | - (offset & SWP_OFFSET_MASK(ret)); + ret.val = (type << SWP_TYPE_SHIFT) | (offset & SWP_OFFSET_MASK); return ret; } @@ -40,7 +38,7 @@ static inline swp_entry_t swp_entry(unsigned long type, pgoff_t offset) */ static inline unsigned swp_type(swp_entry_t entry) { - return (entry.val >> SWP_TYPE_SHIFT(entry)); + return (entry.val >> SWP_TYPE_SHIFT); } /* @@ -49,7 +47,7 @@ static inline unsigned swp_type(swp_entry_t entry) */ static inline pgoff_t swp_offset(swp_entry_t entry) { - return entry.val & SWP_OFFSET_MASK(entry); + return entry.val & SWP_OFFSET_MASK; } #ifdef CONFIG_MMU @@ -90,16 +88,13 @@ static inline swp_entry_t radix_to_swp_entry(void *arg) { swp_entry_t entry; - entry.val = (unsigned long)arg >> RADIX_TREE_EXCEPTIONAL_SHIFT; + entry.val = xa_to_value(arg); return entry; } static inline void *swp_to_radix_entry(swp_entry_t entry) { - unsigned long value; - - value = entry.val << RADIX_TREE_EXCEPTIONAL_SHIFT; - return (void *)(value | RADIX_TREE_EXCEPTIONAL_ENTRY); + return xa_mk_value(entry.val); } #if IS_ENABLED(CONFIG_DEVICE_PRIVATE) diff --git a/include/linux/xarray.h b/include/linux/xarray.h index 2dfc8006fe64..d9514928ddac 100644 --- a/include/linux/xarray.h +++ b/include/linux/xarray.h @@ -4,10 +4,432 @@ /* * eXtensible Arrays * Copyright (c) 2017 Microsoft Corporation - * Author: Matthew Wilcox <mawilcox@microsoft.com> + * Author: Matthew Wilcox <willy@infradead.org> + * + * See Documentation/core-api/xarray.rst for how to use the XArray. */ +#include <linux/bug.h> +#include <linux/compiler.h> +#include <linux/gfp.h> +#include <linux/kconfig.h> +#include <linux/kernel.h> +#include <linux/rcupdate.h> #include <linux/spinlock.h> +#include <linux/types.h> + +/* + * The bottom two bits of the entry determine how the XArray interprets + * the contents: + * + * 00: Pointer entry + * 10: Internal entry + * x1: Value entry or tagged pointer + * + * Attempting to store internal entries in the XArray is a bug. + * + * Most internal entries are pointers to the next node in the tree. + * The following internal entries have a special meaning: + * + * 0-62: Sibling entries + * 256: Zero entry + * 257: Retry entry + * + * Errors are also represented as internal entries, but use the negative + * space (-4094 to -2). They're never stored in the slots array; only + * returned by the normal API. + */ + +#define BITS_PER_XA_VALUE (BITS_PER_LONG - 1) + +/** + * xa_mk_value() - Create an XArray entry from an integer. + * @v: Value to store in XArray. + * + * Context: Any context. + * Return: An entry suitable for storing in the XArray. + */ +static inline void *xa_mk_value(unsigned long v) +{ + WARN_ON((long)v < 0); + return (void *)((v << 1) | 1); +} + +/** + * xa_to_value() - Get value stored in an XArray entry. + * @entry: XArray entry. + * + * Context: Any context. + * Return: The value stored in the XArray entry. + */ +static inline unsigned long xa_to_value(const void *entry) +{ + return (unsigned long)entry >> 1; +} + +/** + * xa_is_value() - Determine if an entry is a value. + * @entry: XArray entry. + * + * Context: Any context. + * Return: True if the entry is a value, false if it is a pointer. + */ +static inline bool xa_is_value(const void *entry) +{ + return (unsigned long)entry & 1; +} + +/** + * xa_tag_pointer() - Create an XArray entry for a tagged pointer. + * @p: Plain pointer. + * @tag: Tag value (0, 1 or 3). + * + * If the user of the XArray prefers, they can tag their pointers instead + * of storing value entries. Three tags are available (0, 1 and 3). + * These are distinct from the xa_mark_t as they are not replicated up + * through the array and cannot be searched for. + * + * Context: Any context. + * Return: An XArray entry. + */ +static inline void *xa_tag_pointer(void *p, unsigned long tag) +{ + return (void *)((unsigned long)p | tag); +} + +/** + * xa_untag_pointer() - Turn an XArray entry into a plain pointer. + * @entry: XArray entry. + * + * If you have stored a tagged pointer in the XArray, call this function + * to get the untagged version of the pointer. + * + * Context: Any context. + * Return: A pointer. + */ +static inline void *xa_untag_pointer(void *entry) +{ + return (void *)((unsigned long)entry & ~3UL); +} + +/** + * xa_pointer_tag() - Get the tag stored in an XArray entry. + * @entry: XArray entry. + * + * If you have stored a tagged pointer in the XArray, call this function + * to get the tag of that pointer. + * + * Context: Any context. + * Return: A tag. + */ +static inline unsigned int xa_pointer_tag(void *entry) +{ + return (unsigned long)entry & 3UL; +} + +/* + * xa_mk_internal() - Create an internal entry. + * @v: Value to turn into an internal entry. + * + * Context: Any context. + * Return: An XArray internal entry corresponding to this value. + */ +static inline void *xa_mk_internal(unsigned long v) +{ + return (void *)((v << 2) | 2); +} + +/* + * xa_to_internal() - Extract the value from an internal entry. + * @entry: XArray entry. + * + * Context: Any context. + * Return: The value which was stored in the internal entry. + */ +static inline unsigned long xa_to_internal(const void *entry) +{ + return (unsigned long)entry >> 2; +} + +/* + * xa_is_internal() - Is the entry an internal entry? + * @entry: XArray entry. + * + * Context: Any context. + * Return: %true if the entry is an internal entry. + */ +static inline bool xa_is_internal(const void *entry) +{ + return ((unsigned long)entry & 3) == 2; +} + +/** + * xa_is_err() - Report whether an XArray operation returned an error + * @entry: Result from calling an XArray function + * + * If an XArray operation cannot complete an operation, it will return + * a special value indicating an error. This function tells you + * whether an error occurred; xa_err() tells you which error occurred. + * + * Context: Any context. + * Return: %true if the entry indicates an error. + */ +static inline bool xa_is_err(const void *entry) +{ + return unlikely(xa_is_internal(entry)); +} + +/** + * xa_err() - Turn an XArray result into an errno. + * @entry: Result from calling an XArray function. + * + * If an XArray operation cannot complete an operation, it will return + * a special pointer value which encodes an errno. This function extracts + * the errno from the pointer value, or returns 0 if the pointer does not + * represent an errno. + * + * Context: Any context. + * Return: A negative errno or 0. + */ +static inline int xa_err(void *entry) +{ + /* xa_to_internal() would not do sign extension. */ + if (xa_is_err(entry)) + return (long)entry >> 2; + return 0; +} + +typedef unsigned __bitwise xa_mark_t; +#define XA_MARK_0 ((__force xa_mark_t)0U) +#define XA_MARK_1 ((__force xa_mark_t)1U) +#define XA_MARK_2 ((__force xa_mark_t)2U) +#define XA_PRESENT ((__force xa_mark_t)8U) +#define XA_MARK_MAX XA_MARK_2 +#define XA_FREE_MARK XA_MARK_0 + +enum xa_lock_type { + XA_LOCK_IRQ = 1, + XA_LOCK_BH = 2, +}; + +/* + * Values for xa_flags. The radix tree stores its GFP flags in the xa_flags, + * and we remain compatible with that. + */ +#define XA_FLAGS_LOCK_IRQ ((__force gfp_t)XA_LOCK_IRQ) +#define XA_FLAGS_LOCK_BH ((__force gfp_t)XA_LOCK_BH) +#define XA_FLAGS_TRACK_FREE ((__force gfp_t)4U) +#define XA_FLAGS_MARK(mark) ((__force gfp_t)((1U << __GFP_BITS_SHIFT) << \ + (__force unsigned)(mark))) + +#define XA_FLAGS_ALLOC (XA_FLAGS_TRACK_FREE | XA_FLAGS_MARK(XA_FREE_MARK)) + +/** + * struct xarray - The anchor of the XArray. + * @xa_lock: Lock that protects the contents of the XArray. + * + * To use the xarray, define it statically or embed it in your data structure. + * It is a very small data structure, so it does not usually make sense to + * allocate it separately and keep a pointer to it in your data structure. + * + * You may use the xa_lock to protect your own data structures as well. + */ +/* + * If all of the entries in the array are NULL, @xa_head is a NULL pointer. + * If the only non-NULL entry in the array is at index 0, @xa_head is that + * entry. If any other entry in the array is non-NULL, @xa_head points + * to an @xa_node. + */ +struct xarray { + spinlock_t xa_lock; +/* private: The rest of the data structure is not to be used directly. */ + gfp_t xa_flags; + void __rcu * xa_head; +}; + +#define XARRAY_INIT(name, flags) { \ + .xa_lock = __SPIN_LOCK_UNLOCKED(name.xa_lock), \ + .xa_flags = flags, \ + .xa_head = NULL, \ +} + +/** + * DEFINE_XARRAY_FLAGS() - Define an XArray with custom flags. + * @name: A string that names your XArray. + * @flags: XA_FLAG values. + * + * This is intended for file scope definitions of XArrays. It declares + * and initialises an empty XArray with the chosen name and flags. It is + * equivalent to calling xa_init_flags() on the array, but it does the + * initialisation at compiletime instead of runtime. + */ +#define DEFINE_XARRAY_FLAGS(name, flags) \ + struct xarray name = XARRAY_INIT(name, flags) + +/** + * DEFINE_XARRAY() - Define an XArray. + * @name: A string that names your XArray. + * + * This is intended for file scope definitions of XArrays. It declares + * and initialises an empty XArray with the chosen name. It is equivalent + * to calling xa_init() on the array, but it does the initialisation at + * compiletime instead of runtime. + */ +#define DEFINE_XARRAY(name) DEFINE_XARRAY_FLAGS(name, 0) + +/** + * DEFINE_XARRAY_ALLOC() - Define an XArray which can allocate IDs. + * @name: A string that names your XArray. + * + * This is intended for file scope definitions of allocating XArrays. + * See also DEFINE_XARRAY(). + */ +#define DEFINE_XARRAY_ALLOC(name) DEFINE_XARRAY_FLAGS(name, XA_FLAGS_ALLOC) + +void xa_init_flags(struct xarray *, gfp_t flags); +void *xa_load(struct xarray *, unsigned long index); +void *xa_store(struct xarray *, unsigned long index, void *entry, gfp_t); +void *xa_cmpxchg(struct xarray *, unsigned long index, + void *old, void *entry, gfp_t); +int xa_reserve(struct xarray *, unsigned long index, gfp_t); +void *xa_store_range(struct xarray *, unsigned long first, unsigned long last, + void *entry, gfp_t); +bool xa_get_mark(struct xarray *, unsigned long index, xa_mark_t); +void xa_set_mark(struct xarray *, unsigned long index, xa_mark_t); +void xa_clear_mark(struct xarray *, unsigned long index, xa_mark_t); +void *xa_find(struct xarray *xa, unsigned long *index, + unsigned long max, xa_mark_t) __attribute__((nonnull(2))); +void *xa_find_after(struct xarray *xa, unsigned long *index, + unsigned long max, xa_mark_t) __attribute__((nonnull(2))); +unsigned int xa_extract(struct xarray *, void **dst, unsigned long start, + unsigned long max, unsigned int n, xa_mark_t); +void xa_destroy(struct xarray *); + +/** + * xa_init() - Initialise an empty XArray. + * @xa: XArray. + * + * An empty XArray is full of NULL entries. + * + * Context: Any context. + */ +static inline void xa_init(struct xarray *xa) +{ + xa_init_flags(xa, 0); +} + +/** + * xa_empty() - Determine if an array has any present entries. + * @xa: XArray. + * + * Context: Any context. + * Return: %true if the array contains only NULL pointers. + */ +static inline bool xa_empty(const struct xarray *xa) +{ + return xa->xa_head == NULL; +} + +/** + * xa_marked() - Inquire whether any entry in this array has a mark set + * @xa: Array + * @mark: Mark value + * + * Context: Any context. + * Return: %true if any entry has this mark set. + */ +static inline bool xa_marked(const struct xarray *xa, xa_mark_t mark) +{ + return xa->xa_flags & XA_FLAGS_MARK(mark); +} + +/** + * xa_erase() - Erase this entry from the XArray. + * @xa: XArray. + * @index: Index of entry. + * + * This function is the equivalent of calling xa_store() with %NULL as + * the third argument. The XArray does not need to allocate memory, so + * the user does not need to provide GFP flags. + * + * Context: Process context. Takes and releases the xa_lock. + * Return: The entry which used to be at this index. + */ +static inline void *xa_erase(struct xarray *xa, unsigned long index) +{ + return xa_store(xa, index, NULL, 0); +} + +/** + * xa_insert() - Store this entry in the XArray unless another entry is + * already present. + * @xa: XArray. + * @index: Index into array. + * @entry: New entry. + * @gfp: Memory allocation flags. + * + * If you would rather see the existing entry in the array, use xa_cmpxchg(). + * This function is for users who don't care what the entry is, only that + * one is present. + * + * Context: Process context. Takes and releases the xa_lock. + * May sleep if the @gfp flags permit. + * Return: 0 if the store succeeded. -EEXIST if another entry was present. + * -ENOMEM if memory could not be allocated. + */ +static inline int xa_insert(struct xarray *xa, unsigned long index, + void *entry, gfp_t gfp) +{ + void *curr = xa_cmpxchg(xa, index, NULL, entry, gfp); + if (!curr) + return 0; + if (xa_is_err(curr)) + return xa_err(curr); + return -EEXIST; +} + +/** + * xa_release() - Release a reserved entry. + * @xa: XArray. + * @index: Index of entry. + * + * After calling xa_reserve(), you can call this function to release the + * reservation. If the entry at @index has been stored to, this function + * will do nothing. + */ +static inline void xa_release(struct xarray *xa, unsigned long index) +{ + xa_cmpxchg(xa, index, NULL, NULL, 0); +} + +/** + * xa_for_each() - Iterate over a portion of an XArray. + * @xa: XArray. + * @entry: Entry retrieved from array. + * @index: Index of @entry. + * @max: Maximum index to retrieve from array. + * @filter: Selection criterion. + * + * Initialise @index to the lowest index you want to retrieve from the + * array. During the iteration, @entry will have the value of the entry + * stored in @xa at @index. The iteration will skip all entries in the + * array which do not match @filter. You may modify @index during the + * iteration if you want to skip or reprocess indices. It is safe to modify + * the array during the iteration. At the end of the iteration, @entry will + * be set to NULL and @index will have a value less than or equal to max. + * + * xa_for_each() is O(n.log(n)) while xas_for_each() is O(n). You have + * to handle your own locking with xas_for_each(), and if you have to unlock + * after each iteration, it will also end up being O(n.log(n)). xa_for_each() + * will spin if it hits a retry entry; if you intend to see retry entries, + * you should use the xas_for_each() iterator instead. The xas_for_each() + * iterator will expand into more inline code than xa_for_each(). + * + * Context: Any context. Takes and releases the RCU lock. + */ +#define xa_for_each(xa, entry, index, max, filter) \ + for (entry = xa_find(xa, &index, max, filter); entry; \ + entry = xa_find_after(xa, &index, max, filter)) #define xa_trylock(xa) spin_trylock(&(xa)->xa_lock) #define xa_lock(xa) spin_lock(&(xa)->xa_lock) @@ -21,4 +443,873 @@ #define xa_unlock_irqrestore(xa, flags) \ spin_unlock_irqrestore(&(xa)->xa_lock, flags) +/* + * Versions of the normal API which require the caller to hold the + * xa_lock. If the GFP flags allow it, they will drop the lock to + * allocate memory, then reacquire it afterwards. These functions + * may also re-enable interrupts if the XArray flags indicate the + * locking should be interrupt safe. + */ +void *__xa_erase(struct xarray *, unsigned long index); +void *__xa_store(struct xarray *, unsigned long index, void *entry, gfp_t); +void *__xa_cmpxchg(struct xarray *, unsigned long index, void *old, + void *entry, gfp_t); +int __xa_alloc(struct xarray *, u32 *id, u32 max, void *entry, gfp_t); +void __xa_set_mark(struct xarray *, unsigned long index, xa_mark_t); +void __xa_clear_mark(struct xarray *, unsigned long index, xa_mark_t); + +/** + * __xa_insert() - Store this entry in the XArray unless another entry is + * already present. + * @xa: XArray. + * @index: Index into array. + * @entry: New entry. + * @gfp: Memory allocation flags. + * + * If you would rather see the existing entry in the array, use __xa_cmpxchg(). + * This function is for users who don't care what the entry is, only that + * one is present. + * + * Context: Any context. Expects xa_lock to be held on entry. May + * release and reacquire xa_lock if the @gfp flags permit. + * Return: 0 if the store succeeded. -EEXIST if another entry was present. + * -ENOMEM if memory could not be allocated. + */ +static inline int __xa_insert(struct xarray *xa, unsigned long index, + void *entry, gfp_t gfp) +{ + void *curr = __xa_cmpxchg(xa, index, NULL, entry, gfp); + if (!curr) + return 0; + if (xa_is_err(curr)) + return xa_err(curr); + return -EEXIST; +} + +/** + * xa_erase_bh() - Erase this entry from the XArray. + * @xa: XArray. + * @index: Index of entry. + * + * This function is the equivalent of calling xa_store() with %NULL as + * the third argument. The XArray does not need to allocate memory, so + * the user does not need to provide GFP flags. + * + * Context: Process context. Takes and releases the xa_lock while + * disabling softirqs. + * Return: The entry which used to be at this index. + */ +static inline void *xa_erase_bh(struct xarray *xa, unsigned long index) +{ + void *entry; + + xa_lock_bh(xa); + entry = __xa_erase(xa, index); + xa_unlock_bh(xa); + + return entry; +} + +/** + * xa_erase_irq() - Erase this entry from the XArray. + * @xa: XArray. + * @index: Index of entry. + * + * This function is the equivalent of calling xa_store() with %NULL as + * the third argument. The XArray does not need to allocate memory, so + * the user does not need to provide GFP flags. + * + * Context: Process context. Takes and releases the xa_lock while + * disabling interrupts. + * Return: The entry which used to be at this index. + */ +static inline void *xa_erase_irq(struct xarray *xa, unsigned long index) +{ + void *entry; + + xa_lock_irq(xa); + entry = __xa_erase(xa, index); + xa_unlock_irq(xa); + + return entry; +} + +/** + * xa_alloc() - Find somewhere to store this entry in the XArray. + * @xa: XArray. + * @id: Pointer to ID. + * @max: Maximum ID to allocate (inclusive). + * @entry: New entry. + * @gfp: Memory allocation flags. + * + * Allocates an unused ID in the range specified by @id and @max. + * Updates the @id pointer with the index, then stores the entry at that + * index. A concurrent lookup will not see an uninitialised @id. + * + * Context: Process context. Takes and releases the xa_lock. May sleep if + * the @gfp flags permit. + * Return: 0 on success, -ENOMEM if memory allocation fails or -ENOSPC if + * there is no more space in the XArray. + */ +static inline int xa_alloc(struct xarray *xa, u32 *id, u32 max, void *entry, + gfp_t gfp) +{ + int err; + + xa_lock(xa); + err = __xa_alloc(xa, id, max, entry, gfp); + xa_unlock(xa); + + return err; +} + +/** + * xa_alloc_bh() - Find somewhere to store this entry in the XArray. + * @xa: XArray. + * @id: Pointer to ID. + * @max: Maximum ID to allocate (inclusive). + * @entry: New entry. + * @gfp: Memory allocation flags. + * + * Allocates an unused ID in the range specified by @id and @max. + * Updates the @id pointer with the index, then stores the entry at that + * index. A concurrent lookup will not see an uninitialised @id. + * + * Context: Process context. Takes and releases the xa_lock while + * disabling softirqs. May sleep if the @gfp flags permit. + * Return: 0 on success, -ENOMEM if memory allocation fails or -ENOSPC if + * there is no more space in the XArray. + */ +static inline int xa_alloc_bh(struct xarray *xa, u32 *id, u32 max, void *entry, + gfp_t gfp) +{ + int err; + + xa_lock_bh(xa); + err = __xa_alloc(xa, id, max, entry, gfp); + xa_unlock_bh(xa); + + return err; +} + +/** + * xa_alloc_irq() - Find somewhere to store this entry in the XArray. + * @xa: XArray. + * @id: Pointer to ID. + * @max: Maximum ID to allocate (inclusive). + * @entry: New entry. + * @gfp: Memory allocation flags. + * + * Allocates an unused ID in the range specified by @id and @max. + * Updates the @id pointer with the index, then stores the entry at that + * index. A concurrent lookup will not see an uninitialised @id. + * + * Context: Process context. Takes and releases the xa_lock while + * disabling interrupts. May sleep if the @gfp flags permit. + * Return: 0 on success, -ENOMEM if memory allocation fails or -ENOSPC if + * there is no more space in the XArray. + */ +static inline int xa_alloc_irq(struct xarray *xa, u32 *id, u32 max, void *entry, + gfp_t gfp) +{ + int err; + + xa_lock_irq(xa); + err = __xa_alloc(xa, id, max, entry, gfp); + xa_unlock_irq(xa); + + return err; +} + +/* Everything below here is the Advanced API. Proceed with caution. */ + +/* + * The xarray is constructed out of a set of 'chunks' of pointers. Choosing + * the best chunk size requires some tradeoffs. A power of two recommends + * itself so that we can walk the tree based purely on shifts and masks. + * Generally, the larger the better; as the number of slots per level of the + * tree increases, the less tall the tree needs to be. But that needs to be + * balanced against the memory consumption of each node. On a 64-bit system, + * xa_node is currently 576 bytes, and we get 7 of them per 4kB page. If we + * doubled the number of slots per node, we'd get only 3 nodes per 4kB page. + */ +#ifndef XA_CHUNK_SHIFT +#define XA_CHUNK_SHIFT (CONFIG_BASE_SMALL ? 4 : 6) +#endif +#define XA_CHUNK_SIZE (1UL << XA_CHUNK_SHIFT) +#define XA_CHUNK_MASK (XA_CHUNK_SIZE - 1) +#define XA_MAX_MARKS 3 +#define XA_MARK_LONGS DIV_ROUND_UP(XA_CHUNK_SIZE, BITS_PER_LONG) + +/* + * @count is the count of every non-NULL element in the ->slots array + * whether that is a value entry, a retry entry, a user pointer, + * a sibling entry or a pointer to the next level of the tree. + * @nr_values is the count of every element in ->slots which is + * either a value entry or a sibling of a value entry. + */ +struct xa_node { + unsigned char shift; /* Bits remaining in each slot */ + unsigned char offset; /* Slot offset in parent */ + unsigned char count; /* Total entry count */ + unsigned char nr_values; /* Value entry count */ + struct xa_node __rcu *parent; /* NULL at top of tree */ + struct xarray *array; /* The array we belong to */ + union { + struct list_head private_list; /* For tree user */ + struct rcu_head rcu_head; /* Used when freeing node */ + }; + void __rcu *slots[XA_CHUNK_SIZE]; + union { + unsigned long tags[XA_MAX_MARKS][XA_MARK_LONGS]; + unsigned long marks[XA_MAX_MARKS][XA_MARK_LONGS]; + }; +}; + +void xa_dump(const struct xarray *); +void xa_dump_node(const struct xa_node *); + +#ifdef XA_DEBUG +#define XA_BUG_ON(xa, x) do { \ + if (x) { \ + xa_dump(xa); \ + BUG(); \ + } \ + } while (0) +#define XA_NODE_BUG_ON(node, x) do { \ + if (x) { \ + if (node) xa_dump_node(node); \ + BUG(); \ + } \ + } while (0) +#else +#define XA_BUG_ON(xa, x) do { } while (0) +#define XA_NODE_BUG_ON(node, x) do { } while (0) +#endif + +/* Private */ +static inline void *xa_head(const struct xarray *xa) +{ + return rcu_dereference_check(xa->xa_head, + lockdep_is_held(&xa->xa_lock)); +} + +/* Private */ +static inline void *xa_head_locked(const struct xarray *xa) +{ + return rcu_dereference_protected(xa->xa_head, + lockdep_is_held(&xa->xa_lock)); +} + +/* Private */ +static inline void *xa_entry(const struct xarray *xa, + const struct xa_node *node, unsigned int offset) +{ + XA_NODE_BUG_ON(node, offset >= XA_CHUNK_SIZE); + return rcu_dereference_check(node->slots[offset], + lockdep_is_held(&xa->xa_lock)); +} + +/* Private */ +static inline void *xa_entry_locked(const struct xarray *xa, + const struct xa_node *node, unsigned int offset) +{ + XA_NODE_BUG_ON(node, offset >= XA_CHUNK_SIZE); + return rcu_dereference_protected(node->slots[offset], + lockdep_is_held(&xa->xa_lock)); +} + +/* Private */ +static inline struct xa_node *xa_parent(const struct xarray *xa, + const struct xa_node *node) +{ + return rcu_dereference_check(node->parent, + lockdep_is_held(&xa->xa_lock)); +} + +/* Private */ +static inline struct xa_node *xa_parent_locked(const struct xarray *xa, + const struct xa_node *node) +{ + return rcu_dereference_protected(node->parent, + lockdep_is_held(&xa->xa_lock)); +} + +/* Private */ +static inline void *xa_mk_node(const struct xa_node *node) +{ + return (void *)((unsigned long)node | 2); +} + +/* Private */ +static inline struct xa_node *xa_to_node(const void *entry) +{ + return (struct xa_node *)((unsigned long)entry - 2); +} + +/* Private */ +static inline bool xa_is_node(const void *entry) +{ + return xa_is_internal(entry) && (unsigned long)entry > 4096; +} + +/* Private */ +static inline void *xa_mk_sibling(unsigned int offset) +{ + return xa_mk_internal(offset); +} + +/* Private */ +static inline unsigned long xa_to_sibling(const void *entry) +{ + return xa_to_internal(entry); +} + +/** + * xa_is_sibling() - Is the entry a sibling entry? + * @entry: Entry retrieved from the XArray + * + * Return: %true if the entry is a sibling entry. + */ +static inline bool xa_is_sibling(const void *entry) +{ + return IS_ENABLED(CONFIG_XARRAY_MULTI) && xa_is_internal(entry) && + (entry < xa_mk_sibling(XA_CHUNK_SIZE - 1)); +} + +#define XA_ZERO_ENTRY xa_mk_internal(256) +#define XA_RETRY_ENTRY xa_mk_internal(257) + +/** + * xa_is_zero() - Is the entry a zero entry? + * @entry: Entry retrieved from the XArray + * + * Return: %true if the entry is a zero entry. + */ +static inline bool xa_is_zero(const void *entry) +{ + return unlikely(entry == XA_ZERO_ENTRY); +} + +/** + * xa_is_retry() - Is the entry a retry entry? + * @entry: Entry retrieved from the XArray + * + * Return: %true if the entry is a retry entry. + */ +static inline bool xa_is_retry(const void *entry) +{ + return unlikely(entry == XA_RETRY_ENTRY); +} + +/** + * typedef xa_update_node_t - A callback function from the XArray. + * @node: The node which is being processed + * + * This function is called every time the XArray updates the count of + * present and value entries in a node. It allows advanced users to + * maintain the private_list in the node. + * + * Context: The xa_lock is held and interrupts may be disabled. + * Implementations should not drop the xa_lock, nor re-enable + * interrupts. + */ +typedef void (*xa_update_node_t)(struct xa_node *node); + +/* + * The xa_state is opaque to its users. It contains various different pieces + * of state involved in the current operation on the XArray. It should be + * declared on the stack and passed between the various internal routines. + * The various elements in it should not be accessed directly, but only + * through the provided accessor functions. The below documentation is for + * the benefit of those working on the code, not for users of the XArray. + * + * @xa_node usually points to the xa_node containing the slot we're operating + * on (and @xa_offset is the offset in the slots array). If there is a + * single entry in the array at index 0, there are no allocated xa_nodes to + * point to, and so we store %NULL in @xa_node. @xa_node is set to + * the value %XAS_RESTART if the xa_state is not walked to the correct + * position in the tree of nodes for this operation. If an error occurs + * during an operation, it is set to an %XAS_ERROR value. If we run off the + * end of the allocated nodes, it is set to %XAS_BOUNDS. + */ +struct xa_state { + struct xarray *xa; + unsigned long xa_index; + unsigned char xa_shift; + unsigned char xa_sibs; + unsigned char xa_offset; + unsigned char xa_pad; /* Helps gcc generate better code */ + struct xa_node *xa_node; + struct xa_node *xa_alloc; + xa_update_node_t xa_update; +}; + +/* + * We encode errnos in the xas->xa_node. If an error has happened, we need to + * drop the lock to fix it, and once we've done so the xa_state is invalid. + */ +#define XA_ERROR(errno) ((struct xa_node *)(((unsigned long)errno << 2) | 2UL)) +#define XAS_BOUNDS ((struct xa_node *)1UL) +#define XAS_RESTART ((struct xa_node *)3UL) + +#define __XA_STATE(array, index, shift, sibs) { \ + .xa = array, \ + .xa_index = index, \ + .xa_shift = shift, \ + .xa_sibs = sibs, \ + .xa_offset = 0, \ + .xa_pad = 0, \ + .xa_node = XAS_RESTART, \ + .xa_alloc = NULL, \ + .xa_update = NULL \ +} + +/** + * XA_STATE() - Declare an XArray operation state. + * @name: Name of this operation state (usually xas). + * @array: Array to operate on. + * @index: Initial index of interest. + * + * Declare and initialise an xa_state on the stack. + */ +#define XA_STATE(name, array, index) \ + struct xa_state name = __XA_STATE(array, index, 0, 0) + +/** + * XA_STATE_ORDER() - Declare an XArray operation state. + * @name: Name of this operation state (usually xas). + * @array: Array to operate on. + * @index: Initial index of interest. + * @order: Order of entry. + * + * Declare and initialise an xa_state on the stack. This variant of + * XA_STATE() allows you to specify the 'order' of the element you + * want to operate on.` + */ +#define XA_STATE_ORDER(name, array, index, order) \ + struct xa_state name = __XA_STATE(array, \ + (index >> order) << order, \ + order - (order % XA_CHUNK_SHIFT), \ + (1U << (order % XA_CHUNK_SHIFT)) - 1) + +#define xas_marked(xas, mark) xa_marked((xas)->xa, (mark)) +#define xas_trylock(xas) xa_trylock((xas)->xa) +#define xas_lock(xas) xa_lock((xas)->xa) +#define xas_unlock(xas) xa_unlock((xas)->xa) +#define xas_lock_bh(xas) xa_lock_bh((xas)->xa) +#define xas_unlock_bh(xas) xa_unlock_bh((xas)->xa) +#define xas_lock_irq(xas) xa_lock_irq((xas)->xa) +#define xas_unlock_irq(xas) xa_unlock_irq((xas)->xa) +#define xas_lock_irqsave(xas, flags) \ + xa_lock_irqsave((xas)->xa, flags) +#define xas_unlock_irqrestore(xas, flags) \ + xa_unlock_irqrestore((xas)->xa, flags) + +/** + * xas_error() - Return an errno stored in the xa_state. + * @xas: XArray operation state. + * + * Return: 0 if no error has been noted. A negative errno if one has. + */ +static inline int xas_error(const struct xa_state *xas) +{ + return xa_err(xas->xa_node); +} + +/** + * xas_set_err() - Note an error in the xa_state. + * @xas: XArray operation state. + * @err: Negative error number. + * + * Only call this function with a negative @err; zero or positive errors + * will probably not behave the way you think they should. If you want + * to clear the error from an xa_state, use xas_reset(). + */ +static inline void xas_set_err(struct xa_state *xas, long err) +{ + xas->xa_node = XA_ERROR(err); +} + +/** + * xas_invalid() - Is the xas in a retry or error state? + * @xas: XArray operation state. + * + * Return: %true if the xas cannot be used for operations. + */ +static inline bool xas_invalid(const struct xa_state *xas) +{ + return (unsigned long)xas->xa_node & 3; +} + +/** + * xas_valid() - Is the xas a valid cursor into the array? + * @xas: XArray operation state. + * + * Return: %true if the xas can be used for operations. + */ +static inline bool xas_valid(const struct xa_state *xas) +{ + return !xas_invalid(xas); +} + +/** + * xas_is_node() - Does the xas point to a node? + * @xas: XArray operation state. + * + * Return: %true if the xas currently references a node. + */ +static inline bool xas_is_node(const struct xa_state *xas) +{ + return xas_valid(xas) && xas->xa_node; +} + +/* True if the pointer is something other than a node */ +static inline bool xas_not_node(struct xa_node *node) +{ + return ((unsigned long)node & 3) || !node; +} + +/* True if the node represents RESTART or an error */ +static inline bool xas_frozen(struct xa_node *node) +{ + return (unsigned long)node & 2; +} + +/* True if the node represents head-of-tree, RESTART or BOUNDS */ +static inline bool xas_top(struct xa_node *node) +{ + return node <= XAS_RESTART; +} + +/** + * xas_reset() - Reset an XArray operation state. + * @xas: XArray operation state. + * + * Resets the error or walk state of the @xas so future walks of the + * array will start from the root. Use this if you have dropped the + * xarray lock and want to reuse the xa_state. + * + * Context: Any context. + */ +static inline void xas_reset(struct xa_state *xas) +{ + xas->xa_node = XAS_RESTART; +} + +/** + * xas_retry() - Retry the operation if appropriate. + * @xas: XArray operation state. + * @entry: Entry from xarray. + * + * The advanced functions may sometimes return an internal entry, such as + * a retry entry or a zero entry. This function sets up the @xas to restart + * the walk from the head of the array if needed. + * + * Context: Any context. + * Return: true if the operation needs to be retried. + */ +static inline bool xas_retry(struct xa_state *xas, const void *entry) +{ + if (xa_is_zero(entry)) + return true; + if (!xa_is_retry(entry)) + return false; + xas_reset(xas); + return true; +} + +void *xas_load(struct xa_state *); +void *xas_store(struct xa_state *, void *entry); +void *xas_find(struct xa_state *, unsigned long max); +void *xas_find_conflict(struct xa_state *); + +bool xas_get_mark(const struct xa_state *, xa_mark_t); +void xas_set_mark(const struct xa_state *, xa_mark_t); +void xas_clear_mark(const struct xa_state *, xa_mark_t); +void *xas_find_marked(struct xa_state *, unsigned long max, xa_mark_t); +void xas_init_marks(const struct xa_state *); + +bool xas_nomem(struct xa_state *, gfp_t); +void xas_pause(struct xa_state *); + +void xas_create_range(struct xa_state *); + +/** + * xas_reload() - Refetch an entry from the xarray. + * @xas: XArray operation state. + * + * Use this function to check that a previously loaded entry still has + * the same value. This is useful for the lockless pagecache lookup where + * we walk the array with only the RCU lock to protect us, lock the page, + * then check that the page hasn't moved since we looked it up. + * + * The caller guarantees that @xas is still valid. If it may be in an + * error or restart state, call xas_load() instead. + * + * Return: The entry at this location in the xarray. + */ +static inline void *xas_reload(struct xa_state *xas) +{ + struct xa_node *node = xas->xa_node; + + if (node) + return xa_entry(xas->xa, node, xas->xa_offset); + return xa_head(xas->xa); +} + +/** + * xas_set() - Set up XArray operation state for a different index. + * @xas: XArray operation state. + * @index: New index into the XArray. + * + * Move the operation state to refer to a different index. This will + * have the effect of starting a walk from the top; see xas_next() + * to move to an adjacent index. + */ +static inline void xas_set(struct xa_state *xas, unsigned long index) +{ + xas->xa_index = index; + xas->xa_node = XAS_RESTART; +} + +/** + * xas_set_order() - Set up XArray operation state for a multislot entry. + * @xas: XArray operation state. + * @index: Target of the operation. + * @order: Entry occupies 2^@order indices. + */ +static inline void xas_set_order(struct xa_state *xas, unsigned long index, + unsigned int order) +{ +#ifdef CONFIG_XARRAY_MULTI + xas->xa_index = order < BITS_PER_LONG ? (index >> order) << order : 0; + xas->xa_shift = order - (order % XA_CHUNK_SHIFT); + xas->xa_sibs = (1 << (order % XA_CHUNK_SHIFT)) - 1; + xas->xa_node = XAS_RESTART; +#else + BUG_ON(order > 0); + xas_set(xas, index); +#endif +} + +/** + * xas_set_update() - Set up XArray operation state for a callback. + * @xas: XArray operation state. + * @update: Function to call when updating a node. + * + * The XArray can notify a caller after it has updated an xa_node. + * This is advanced functionality and is only needed by the page cache. + */ +static inline void xas_set_update(struct xa_state *xas, xa_update_node_t update) +{ + xas->xa_update = update; +} + +/** + * xas_next_entry() - Advance iterator to next present entry. + * @xas: XArray operation state. + * @max: Highest index to return. + * + * xas_next_entry() is an inline function to optimise xarray traversal for + * speed. It is equivalent to calling xas_find(), and will call xas_find() + * for all the hard cases. + * + * Return: The next present entry after the one currently referred to by @xas. + */ +static inline void *xas_next_entry(struct xa_state *xas, unsigned long max) +{ + struct xa_node *node = xas->xa_node; + void *entry; + + if (unlikely(xas_not_node(node) || node->shift || + xas->xa_offset != (xas->xa_index & XA_CHUNK_MASK))) + return xas_find(xas, max); + + do { + if (unlikely(xas->xa_index >= max)) + return xas_find(xas, max); + if (unlikely(xas->xa_offset == XA_CHUNK_MASK)) + return xas_find(xas, max); + entry = xa_entry(xas->xa, node, xas->xa_offset + 1); + if (unlikely(xa_is_internal(entry))) + return xas_find(xas, max); + xas->xa_offset++; + xas->xa_index++; + } while (!entry); + + return entry; +} + +/* Private */ +static inline unsigned int xas_find_chunk(struct xa_state *xas, bool advance, + xa_mark_t mark) +{ + unsigned long *addr = xas->xa_node->marks[(__force unsigned)mark]; + unsigned int offset = xas->xa_offset; + + if (advance) + offset++; + if (XA_CHUNK_SIZE == BITS_PER_LONG) { + if (offset < XA_CHUNK_SIZE) { + unsigned long data = *addr & (~0UL << offset); + if (data) + return __ffs(data); + } + return XA_CHUNK_SIZE; + } + + return find_next_bit(addr, XA_CHUNK_SIZE, offset); +} + +/** + * xas_next_marked() - Advance iterator to next marked entry. + * @xas: XArray operation state. + * @max: Highest index to return. + * @mark: Mark to search for. + * + * xas_next_marked() is an inline function to optimise xarray traversal for + * speed. It is equivalent to calling xas_find_marked(), and will call + * xas_find_marked() for all the hard cases. + * + * Return: The next marked entry after the one currently referred to by @xas. + */ +static inline void *xas_next_marked(struct xa_state *xas, unsigned long max, + xa_mark_t mark) +{ + struct xa_node *node = xas->xa_node; + unsigned int offset; + + if (unlikely(xas_not_node(node) || node->shift)) + return xas_find_marked(xas, max, mark); + offset = xas_find_chunk(xas, true, mark); + xas->xa_offset = offset; + xas->xa_index = (xas->xa_index & ~XA_CHUNK_MASK) + offset; + if (xas->xa_index > max) + return NULL; + if (offset == XA_CHUNK_SIZE) + return xas_find_marked(xas, max, mark); + return xa_entry(xas->xa, node, offset); +} + +/* + * If iterating while holding a lock, drop the lock and reschedule + * every %XA_CHECK_SCHED loops. + */ +enum { + XA_CHECK_SCHED = 4096, +}; + +/** + * xas_for_each() - Iterate over a range of an XArray. + * @xas: XArray operation state. + * @entry: Entry retrieved from the array. + * @max: Maximum index to retrieve from array. + * + * The loop body will be executed for each entry present in the xarray + * between the current xas position and @max. @entry will be set to + * the entry retrieved from the xarray. It is safe to delete entries + * from the array in the loop body. You should hold either the RCU lock + * or the xa_lock while iterating. If you need to drop the lock, call + * xas_pause() first. + */ +#define xas_for_each(xas, entry, max) \ + for (entry = xas_find(xas, max); entry; \ + entry = xas_next_entry(xas, max)) + +/** + * xas_for_each_marked() - Iterate over a range of an XArray. + * @xas: XArray operation state. + * @entry: Entry retrieved from the array. + * @max: Maximum index to retrieve from array. + * @mark: Mark to search for. + * + * The loop body will be executed for each marked entry in the xarray + * between the current xas position and @max. @entry will be set to + * the entry retrieved from the xarray. It is safe to delete entries + * from the array in the loop body. You should hold either the RCU lock + * or the xa_lock while iterating. If you need to drop the lock, call + * xas_pause() first. + */ +#define xas_for_each_marked(xas, entry, max, mark) \ + for (entry = xas_find_marked(xas, max, mark); entry; \ + entry = xas_next_marked(xas, max, mark)) + +/** + * xas_for_each_conflict() - Iterate over a range of an XArray. + * @xas: XArray operation state. + * @entry: Entry retrieved from the array. + * + * The loop body will be executed for each entry in the XArray that lies + * within the range specified by @xas. If the loop completes successfully, + * any entries that lie in this range will be replaced by @entry. The caller + * may break out of the loop; if they do so, the contents of the XArray will + * be unchanged. The operation may fail due to an out of memory condition. + * The caller may also call xa_set_err() to exit the loop while setting an + * error to record the reason. + */ +#define xas_for_each_conflict(xas, entry) \ + while ((entry = xas_find_conflict(xas))) + +void *__xas_next(struct xa_state *); +void *__xas_prev(struct xa_state *); + +/** + * xas_prev() - Move iterator to previous index. + * @xas: XArray operation state. + * + * If the @xas was in an error state, it will remain in an error state + * and this function will return %NULL. If the @xas has never been walked, + * it will have the effect of calling xas_load(). Otherwise one will be + * subtracted from the index and the state will be walked to the correct + * location in the array for the next operation. + * + * If the iterator was referencing index 0, this function wraps + * around to %ULONG_MAX. + * + * Return: The entry at the new index. This may be %NULL or an internal + * entry. + */ +static inline void *xas_prev(struct xa_state *xas) +{ + struct xa_node *node = xas->xa_node; + + if (unlikely(xas_not_node(node) || node->shift || + xas->xa_offset == 0)) + return __xas_prev(xas); + + xas->xa_index--; + xas->xa_offset--; + return xa_entry(xas->xa, node, xas->xa_offset); +} + +/** + * xas_next() - Move state to next index. + * @xas: XArray operation state. + * + * If the @xas was in an error state, it will remain in an error state + * and this function will return %NULL. If the @xas has never been walked, + * it will have the effect of calling xas_load(). Otherwise one will be + * added to the index and the state will be walked to the correct + * location in the array for the next operation. + * + * If the iterator was referencing index %ULONG_MAX, this function wraps + * around to 0. + * + * Return: The entry at the new index. This may be %NULL or an internal + * entry. + */ +static inline void *xas_next(struct xa_state *xas) +{ + struct xa_node *node = xas->xa_node; + + if (unlikely(xas_not_node(node) || node->shift || + xas->xa_offset == XA_CHUNK_MASK)) + return __xas_next(xas); + + xas->xa_index++; + xas->xa_offset++; + return xa_entry(xas->xa, node, xas->xa_offset); +} + #endif /* _LINUX_XARRAY_H */ |