-rw-r--r--   0000_README                  4
-rw-r--r--   1175_linux-5.10.176.patch    8752
2 files changed, 8756 insertions, 0 deletions
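One of the fixes carried in this point release, visible in the arch/s390/boot/ipl_report.c hunk below, extends the boot-time search for a safe load address so that the component and certificate list headers themselves are treated as reserved ranges. The following is a minimal, self-contained sketch of that bump-and-rescan pattern only; struct range, find_safe_addr and this standalone intersects() are names invented for the example, not kernel APIs.

/*
 * Illustrative sketch of the "bump past the overlap and rescan" pattern
 * used by the ipl_report.c hunk below. Not kernel code.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct range { unsigned long addr, len; };

/* Do the half-open ranges [a, a+a_len) and [b, b+b_len) overlap? */
static bool intersects(unsigned long a, unsigned long a_len,
                       unsigned long b, unsigned long b_len)
{
	return a < b + b_len && b < a + a_len;
}

/*
 * Bump safe_addr past any reserved range it overlaps, and restart the scan
 * after every bump so ranges already checked are re-checked at the new
 * position.
 */
static unsigned long find_safe_addr(unsigned long safe_addr, unsigned long size,
                                    const struct range *reserved, size_t n)
{
	size_t i;
repeat:
	for (i = 0; i < n; i++) {
		if (intersects(safe_addr, size, reserved[i].addr, reserved[i].len)) {
			safe_addr = reserved[i].addr + reserved[i].len;
			goto repeat;
		}
	}
	return safe_addr;
}

int main(void)
{
	const struct range reserved[] = { { 0x1000, 0x1000 }, { 0x2000, 0x800 } };

	/* Prints 0x2800: 0x1800 is bumped past both reserved ranges. */
	printf("0x%lx\n", find_safe_addr(0x1800, 0x400, reserved, 2));
	return 0;
}

Restarting the scan after every bump is what makes the order of the reserved ranges irrelevant: moving past one range can land inside a range that was already checked, so it must be examined again.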
diff --git a/0000_README b/0000_README
index f3c2cbaf..50964ce7 100644
--- a/0000_README
+++ b/0000_README
@@ -743,6 +743,10 @@ Patch: 1174_linux-5.10.175.patch
 From: https://www.kernel.org
 Desc: Linux 5.10.175
 
+Patch: 1175_linux-5.10.176.patch
+From: https://www.kernel.org
+Desc: Linux 5.10.176
+
 Patch: 1500_XATTR_USER_PREFIX.patch
 From: https://bugs.gentoo.org/show_bug.cgi?id=470644
 Desc: Support for namespace user.pax.* on tmpfs.
diff --git a/1175_linux-5.10.176.patch b/1175_linux-5.10.176.patch
new file mode 100644
index 00000000..8c46ce2a
--- /dev/null
+++ b/1175_linux-5.10.176.patch
@@ -0,0 +1,8752 @@
+diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst
+index ca52c82e5bb54..f7b69a0e71e1c 100644
+--- a/Documentation/filesystems/vfs.rst
++++ b/Documentation/filesystems/vfs.rst
+@@ -1188,7 +1188,7 @@ defined:
+ return
+ -ECHILD and it will be called again in ref-walk mode.
+ 
+-``_weak_revalidate``
++``d_weak_revalidate``
+ called when the VFS needs to revalidate a "jumped" dentry. This
+ is called when a path-walk ends at dentry that was not acquired
+ by doing a lookup in the parent directory. This includes "/",
+diff --git a/Documentation/trace/ftrace.rst b/Documentation/trace/ftrace.rst
+index 87cf5c010d5dd..ed2e45f9b7627 100644
+--- a/Documentation/trace/ftrace.rst
++++ b/Documentation/trace/ftrace.rst
+@@ -2923,7 +2923,7 @@ Produces::
+ bash-1994 [000] .... 4342.324898: ima_get_action <-process_measurement
+ bash-1994 [000] .... 4342.324898: ima_match_policy <-ima_get_action
+ bash-1994 [000] .... 4342.324899: do_truncate <-do_last
+- bash-1994 [000] .... 4342.324899: should_remove_suid <-do_truncate
++ bash-1994 [000] .... 4342.324899: setattr_should_drop_suidgid <-do_truncate
+ bash-1994 [000] .... 4342.324899: notify_change <-do_truncate
+ bash-1994 [000] .... 4342.324900: current_fs_time <-notify_change
+ bash-1994 [000] .... 4342.324900: current_kernel_time <-current_fs_time
+diff --git a/Makefile b/Makefile
+index e6b09052f222b..71caf59383615 100644
+--- a/Makefile
++++ b/Makefile
+@@ -1,7 +1,7 @@
+ # SPDX-License-Identifier: GPL-2.0
+ VERSION = 5
+ PATCHLEVEL = 10
+-SUBLEVEL = 175
++SUBLEVEL = 176
+ EXTRAVERSION =
+ NAME = Dare mighty things
+ 
+diff --git a/arch/s390/boot/ipl_report.c b/arch/s390/boot/ipl_report.c
+index 0b4965573656f..88bacf4999c47 100644
+--- a/arch/s390/boot/ipl_report.c
++++ b/arch/s390/boot/ipl_report.c
+@@ -57,11 +57,19 @@ repeat:
+ if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && INITRD_START && INITRD_SIZE &&
+ intersects(INITRD_START, INITRD_SIZE, safe_addr, size))
+ safe_addr = INITRD_START + INITRD_SIZE;
++ if (intersects(safe_addr, size, (unsigned long)comps, comps->len)) {
++ safe_addr = (unsigned long)comps + comps->len;
++ goto repeat;
++ }
+ for_each_rb_entry(comp, comps)
+ if (intersects(safe_addr, size, comp->addr, comp->len)) {
+ safe_addr = comp->addr + comp->len;
+ goto repeat;
+ }
++ if (intersects(safe_addr, size, (unsigned long)certs, certs->len)) {
++ safe_addr = (unsigned long)certs + certs->len;
++ goto repeat;
++ }
+ for_each_rb_entry(cert, certs)
+ if (intersects(safe_addr, size, cert->addr, cert->len)) {
+ safe_addr = cert->addr + cert->len;
+diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
+index 1906387a0faf4..0b7c81389c50a 100644
+--- a/arch/x86/kernel/cpu/mce/core.c
++++ b/arch/x86/kernel/cpu/mce/core.c
+@@ -2309,6 +2309,7 @@ static void mce_restart(void)
+ {
+ mce_timer_delete_all();
+ on_each_cpu(mce_cpu_restart, NULL, 1);
++ mce_schedule_work();
+ }
+ 
+ /* Toggle features for corrected errors */
+diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
+index 91371b01eae0c..c165ddbb672fe 100644
+--- a/arch/x86/kvm/vmx/nested.c
++++ b/arch/x86/kvm/vmx/nested.c
+@@ -2998,7 +2998,7 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12,
+ enum vm_entry_failure_code *entry_failure_code)
+ {
+- bool ia32e;
++ bool ia32e = !!(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE);
+ 
+ *entry_failure_code = ENTRY_FAIL_DEFAULT;
+ 
+@@ -3024,6 +3024,13 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
+ vmcs12->guest_ia32_perf_global_ctrl)))
+ return -EINVAL;
+ 
++ if (CC((vmcs12->guest_cr0 & (X86_CR0_PG | X86_CR0_PE)) == X86_CR0_PG))
++ return -EINVAL;
++
++ if (CC(ia32e && !(vmcs12->guest_cr4 & X86_CR4_PAE)) ||
++ CC(ia32e && !(vmcs12->guest_cr0 & X86_CR0_PG)))
++ return -EINVAL;
++
+ /*
+ * If the load IA32_EFER VM-entry control is 1, the following checks
+ * are performed on the field for the IA32_EFER MSR:
+@@ -3035,7 +3042,6 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
+ */
+ if (to_vmx(vcpu)->nested.nested_run_pending &&
+ (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) {
+- ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0;
+ if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) ||
+ CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) ||
+ CC(((vmcs12->guest_cr0 & X86_CR0_PG) &&
+diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c
+index 011e042b47ba7..5ec47af786ddb 100644
+--- a/arch/x86/mm/mem_encrypt_identity.c
++++ b/arch/x86/mm/mem_encrypt_identity.c
+@@ -586,7 +586,8 @@ void __init sme_enable(struct boot_params *bp)
+ cmdline_ptr = (const char *)((u64)bp->hdr.cmd_line_ptr |
+ ((u64)bp->ext_cmd_line_ptr << 32));
+ 
+- cmdline_find_option(cmdline_ptr, cmdline_arg, buffer, sizeof(buffer));
++ if 
(cmdline_find_option(cmdline_ptr, cmdline_arg, buffer, sizeof(buffer)) < 0) ++ return; + + if (!strncmp(buffer, cmdline_on, sizeof(buffer))) + sme_me_mask = me_mask; +diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig +index 40c53632512b7..9617688b58b32 100644 +--- a/drivers/block/Kconfig ++++ b/drivers/block/Kconfig +@@ -16,13 +16,7 @@ menuconfig BLK_DEV + + if BLK_DEV + +-config BLK_DEV_NULL_BLK +- tristate "Null test block driver" +- select CONFIGFS_FS +- +-config BLK_DEV_NULL_BLK_FAULT_INJECTION +- bool "Support fault injection for Null test block driver" +- depends on BLK_DEV_NULL_BLK && FAULT_INJECTION ++source "drivers/block/null_blk/Kconfig" + + config BLK_DEV_FD + tristate "Normal floppy disk support" +diff --git a/drivers/block/Makefile b/drivers/block/Makefile +index e1f63117ee94f..a3170859e01d4 100644 +--- a/drivers/block/Makefile ++++ b/drivers/block/Makefile +@@ -41,12 +41,7 @@ obj-$(CONFIG_BLK_DEV_RSXX) += rsxx/ + obj-$(CONFIG_ZRAM) += zram/ + obj-$(CONFIG_BLK_DEV_RNBD) += rnbd/ + +-obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk.o +-null_blk-objs := null_blk_main.o +-ifeq ($(CONFIG_BLK_DEV_ZONED), y) +-null_blk-$(CONFIG_TRACING) += null_blk_trace.o +-endif +-null_blk-$(CONFIG_BLK_DEV_ZONED) += null_blk_zoned.o ++obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk/ + + skd-y := skd_main.o + swim_mod-y := swim.o swim_asm.o +diff --git a/drivers/block/null_blk.h b/drivers/block/null_blk.h +deleted file mode 100644 +index 7de703f28617b..0000000000000 +--- a/drivers/block/null_blk.h ++++ /dev/null +@@ -1,137 +0,0 @@ +-/* SPDX-License-Identifier: GPL-2.0 */ +-#ifndef __BLK_NULL_BLK_H +-#define __BLK_NULL_BLK_H +- +-#undef pr_fmt +-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +- +-#include <linux/blkdev.h> +-#include <linux/slab.h> +-#include <linux/blk-mq.h> +-#include <linux/hrtimer.h> +-#include <linux/configfs.h> +-#include <linux/badblocks.h> +-#include <linux/fault-inject.h> +- +-struct nullb_cmd { +- struct request *rq; +- struct bio *bio; +- unsigned int tag; +- blk_status_t error; +- struct nullb_queue *nq; +- struct hrtimer timer; +- bool fake_timeout; +-}; +- +-struct nullb_queue { +- unsigned long *tag_map; +- wait_queue_head_t wait; +- unsigned int queue_depth; +- struct nullb_device *dev; +- unsigned int requeue_selection; +- +- struct nullb_cmd *cmds; +-}; +- +-struct nullb_device { +- struct nullb *nullb; +- struct config_item item; +- struct radix_tree_root data; /* data stored in the disk */ +- struct radix_tree_root cache; /* disk cache data */ +- unsigned long flags; /* device flags */ +- unsigned int curr_cache; +- struct badblocks badblocks; +- +- unsigned int nr_zones; +- unsigned int nr_zones_imp_open; +- unsigned int nr_zones_exp_open; +- unsigned int nr_zones_closed; +- struct blk_zone *zones; +- sector_t zone_size_sects; +- spinlock_t zone_lock; +- unsigned long *zone_locks; +- +- unsigned long size; /* device size in MB */ +- unsigned long completion_nsec; /* time in ns to complete a request */ +- unsigned long cache_size; /* disk cache size in MB */ +- unsigned long zone_size; /* zone size in MB if device is zoned */ +- unsigned long zone_capacity; /* zone capacity in MB if device is zoned */ +- unsigned int zone_nr_conv; /* number of conventional zones */ +- unsigned int zone_max_open; /* max number of open zones */ +- unsigned int zone_max_active; /* max number of active zones */ +- unsigned int submit_queues; /* number of submission queues */ +- unsigned int home_node; /* home node for the device */ +- unsigned int queue_mode; /* block interface */ +- 
unsigned int blocksize; /* block size */ +- unsigned int irqmode; /* IRQ completion handler */ +- unsigned int hw_queue_depth; /* queue depth */ +- unsigned int index; /* index of the disk, only valid with a disk */ +- unsigned int mbps; /* Bandwidth throttle cap (in MB/s) */ +- bool blocking; /* blocking blk-mq device */ +- bool use_per_node_hctx; /* use per-node allocation for hardware context */ +- bool power; /* power on/off the device */ +- bool memory_backed; /* if data is stored in memory */ +- bool discard; /* if support discard */ +- bool zoned; /* if device is zoned */ +-}; +- +-struct nullb { +- struct nullb_device *dev; +- struct list_head list; +- unsigned int index; +- struct request_queue *q; +- struct gendisk *disk; +- struct blk_mq_tag_set *tag_set; +- struct blk_mq_tag_set __tag_set; +- unsigned int queue_depth; +- atomic_long_t cur_bytes; +- struct hrtimer bw_timer; +- unsigned long cache_flush_pos; +- spinlock_t lock; +- +- struct nullb_queue *queues; +- unsigned int nr_queues; +- char disk_name[DISK_NAME_LEN]; +-}; +- +-blk_status_t null_process_cmd(struct nullb_cmd *cmd, +- enum req_opf op, sector_t sector, +- unsigned int nr_sectors); +- +-#ifdef CONFIG_BLK_DEV_ZONED +-int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q); +-int null_register_zoned_dev(struct nullb *nullb); +-void null_free_zoned_dev(struct nullb_device *dev); +-int null_report_zones(struct gendisk *disk, sector_t sector, +- unsigned int nr_zones, report_zones_cb cb, void *data); +-blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, +- enum req_opf op, sector_t sector, +- sector_t nr_sectors); +-size_t null_zone_valid_read_len(struct nullb *nullb, +- sector_t sector, unsigned int len); +-#else +-static inline int null_init_zoned_dev(struct nullb_device *dev, +- struct request_queue *q) +-{ +- pr_err("CONFIG_BLK_DEV_ZONED not enabled\n"); +- return -EINVAL; +-} +-static inline int null_register_zoned_dev(struct nullb *nullb) +-{ +- return -ENODEV; +-} +-static inline void null_free_zoned_dev(struct nullb_device *dev) {} +-static inline blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, +- enum req_opf op, sector_t sector, sector_t nr_sectors) +-{ +- return BLK_STS_NOTSUPP; +-} +-static inline size_t null_zone_valid_read_len(struct nullb *nullb, +- sector_t sector, +- unsigned int len) +-{ +- return len; +-} +-#define null_report_zones NULL +-#endif /* CONFIG_BLK_DEV_ZONED */ +-#endif /* __NULL_BLK_H */ +diff --git a/drivers/block/null_blk/Kconfig b/drivers/block/null_blk/Kconfig +new file mode 100644 +index 0000000000000..6bf1f8ca20a24 +--- /dev/null ++++ b/drivers/block/null_blk/Kconfig +@@ -0,0 +1,12 @@ ++# SPDX-License-Identifier: GPL-2.0 ++# ++# Null block device driver configuration ++# ++ ++config BLK_DEV_NULL_BLK ++ tristate "Null test block driver" ++ select CONFIGFS_FS ++ ++config BLK_DEV_NULL_BLK_FAULT_INJECTION ++ bool "Support fault injection for Null test block driver" ++ depends on BLK_DEV_NULL_BLK && FAULT_INJECTION +diff --git a/drivers/block/null_blk/Makefile b/drivers/block/null_blk/Makefile +new file mode 100644 +index 0000000000000..84c36e512ab89 +--- /dev/null ++++ b/drivers/block/null_blk/Makefile +@@ -0,0 +1,11 @@ ++# SPDX-License-Identifier: GPL-2.0 ++ ++# needed for trace events ++ccflags-y += -I$(src) ++ ++obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk.o ++null_blk-objs := main.o ++ifeq ($(CONFIG_BLK_DEV_ZONED), y) ++null_blk-$(CONFIG_TRACING) += trace.o ++endif ++null_blk-$(CONFIG_BLK_DEV_ZONED) += zoned.o +diff --git 
a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c +new file mode 100644 +index 0000000000000..25db095e943b7 +--- /dev/null ++++ b/drivers/block/null_blk/main.c +@@ -0,0 +1,2036 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++/* ++ * Add configfs and memory store: Kyungchan Koh <kkc6196@fb.com> and ++ * Shaohua Li <shli@fb.com> ++ */ ++#include <linux/module.h> ++ ++#include <linux/moduleparam.h> ++#include <linux/sched.h> ++#include <linux/fs.h> ++#include <linux/init.h> ++#include "null_blk.h" ++ ++#define PAGE_SECTORS_SHIFT (PAGE_SHIFT - SECTOR_SHIFT) ++#define PAGE_SECTORS (1 << PAGE_SECTORS_SHIFT) ++#define SECTOR_MASK (PAGE_SECTORS - 1) ++ ++#define FREE_BATCH 16 ++ ++#define TICKS_PER_SEC 50ULL ++#define TIMER_INTERVAL (NSEC_PER_SEC / TICKS_PER_SEC) ++ ++#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION ++static DECLARE_FAULT_ATTR(null_timeout_attr); ++static DECLARE_FAULT_ATTR(null_requeue_attr); ++static DECLARE_FAULT_ATTR(null_init_hctx_attr); ++#endif ++ ++static inline u64 mb_per_tick(int mbps) ++{ ++ return (1 << 20) / TICKS_PER_SEC * ((u64) mbps); ++} ++ ++/* ++ * Status flags for nullb_device. ++ * ++ * CONFIGURED: Device has been configured and turned on. Cannot reconfigure. ++ * UP: Device is currently on and visible in userspace. ++ * THROTTLED: Device is being throttled. ++ * CACHE: Device is using a write-back cache. ++ */ ++enum nullb_device_flags { ++ NULLB_DEV_FL_CONFIGURED = 0, ++ NULLB_DEV_FL_UP = 1, ++ NULLB_DEV_FL_THROTTLED = 2, ++ NULLB_DEV_FL_CACHE = 3, ++}; ++ ++#define MAP_SZ ((PAGE_SIZE >> SECTOR_SHIFT) + 2) ++/* ++ * nullb_page is a page in memory for nullb devices. ++ * ++ * @page: The page holding the data. ++ * @bitmap: The bitmap represents which sector in the page has data. ++ * Each bit represents one block size. For example, sector 8 ++ * will use the 7th bit ++ * The highest 2 bits of bitmap are for special purpose. LOCK means the cache ++ * page is being flushing to storage. FREE means the cache page is freed and ++ * should be skipped from flushing to storage. Please see ++ * null_make_cache_space ++ */ ++struct nullb_page { ++ struct page *page; ++ DECLARE_BITMAP(bitmap, MAP_SZ); ++}; ++#define NULLB_PAGE_LOCK (MAP_SZ - 1) ++#define NULLB_PAGE_FREE (MAP_SZ - 2) ++ ++static LIST_HEAD(nullb_list); ++static struct mutex lock; ++static int null_major; ++static DEFINE_IDA(nullb_indexes); ++static struct blk_mq_tag_set tag_set; ++ ++enum { ++ NULL_IRQ_NONE = 0, ++ NULL_IRQ_SOFTIRQ = 1, ++ NULL_IRQ_TIMER = 2, ++}; ++ ++enum { ++ NULL_Q_BIO = 0, ++ NULL_Q_RQ = 1, ++ NULL_Q_MQ = 2, ++}; ++ ++static int g_no_sched; ++module_param_named(no_sched, g_no_sched, int, 0444); ++MODULE_PARM_DESC(no_sched, "No io scheduler"); ++ ++static int g_submit_queues = 1; ++module_param_named(submit_queues, g_submit_queues, int, 0444); ++MODULE_PARM_DESC(submit_queues, "Number of submission queues"); ++ ++static int g_home_node = NUMA_NO_NODE; ++module_param_named(home_node, g_home_node, int, 0444); ++MODULE_PARM_DESC(home_node, "Home node for the device"); ++ ++#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION ++/* ++ * For more details about fault injection, please refer to ++ * Documentation/fault-injection/fault-injection.rst. ++ */ ++static char g_timeout_str[80]; ++module_param_string(timeout, g_timeout_str, sizeof(g_timeout_str), 0444); ++MODULE_PARM_DESC(timeout, "Fault injection. 
timeout=<interval>,<probability>,<space>,<times>"); ++ ++static char g_requeue_str[80]; ++module_param_string(requeue, g_requeue_str, sizeof(g_requeue_str), 0444); ++MODULE_PARM_DESC(requeue, "Fault injection. requeue=<interval>,<probability>,<space>,<times>"); ++ ++static char g_init_hctx_str[80]; ++module_param_string(init_hctx, g_init_hctx_str, sizeof(g_init_hctx_str), 0444); ++MODULE_PARM_DESC(init_hctx, "Fault injection to fail hctx init. init_hctx=<interval>,<probability>,<space>,<times>"); ++#endif ++ ++static int g_queue_mode = NULL_Q_MQ; ++ ++static int null_param_store_val(const char *str, int *val, int min, int max) ++{ ++ int ret, new_val; ++ ++ ret = kstrtoint(str, 10, &new_val); ++ if (ret) ++ return -EINVAL; ++ ++ if (new_val < min || new_val > max) ++ return -EINVAL; ++ ++ *val = new_val; ++ return 0; ++} ++ ++static int null_set_queue_mode(const char *str, const struct kernel_param *kp) ++{ ++ return null_param_store_val(str, &g_queue_mode, NULL_Q_BIO, NULL_Q_MQ); ++} ++ ++static const struct kernel_param_ops null_queue_mode_param_ops = { ++ .set = null_set_queue_mode, ++ .get = param_get_int, ++}; ++ ++device_param_cb(queue_mode, &null_queue_mode_param_ops, &g_queue_mode, 0444); ++MODULE_PARM_DESC(queue_mode, "Block interface to use (0=bio,1=rq,2=multiqueue)"); ++ ++static int g_gb = 250; ++module_param_named(gb, g_gb, int, 0444); ++MODULE_PARM_DESC(gb, "Size in GB"); ++ ++static int g_bs = 512; ++module_param_named(bs, g_bs, int, 0444); ++MODULE_PARM_DESC(bs, "Block size (in bytes)"); ++ ++static unsigned int nr_devices = 1; ++module_param(nr_devices, uint, 0444); ++MODULE_PARM_DESC(nr_devices, "Number of devices to register"); ++ ++static bool g_blocking; ++module_param_named(blocking, g_blocking, bool, 0444); ++MODULE_PARM_DESC(blocking, "Register as a blocking blk-mq driver device"); ++ ++static bool shared_tags; ++module_param(shared_tags, bool, 0444); ++MODULE_PARM_DESC(shared_tags, "Share tag set between devices for blk-mq"); ++ ++static bool g_shared_tag_bitmap; ++module_param_named(shared_tag_bitmap, g_shared_tag_bitmap, bool, 0444); ++MODULE_PARM_DESC(shared_tag_bitmap, "Use shared tag bitmap for all submission queues for blk-mq"); ++ ++static int g_irqmode = NULL_IRQ_SOFTIRQ; ++ ++static int null_set_irqmode(const char *str, const struct kernel_param *kp) ++{ ++ return null_param_store_val(str, &g_irqmode, NULL_IRQ_NONE, ++ NULL_IRQ_TIMER); ++} ++ ++static const struct kernel_param_ops null_irqmode_param_ops = { ++ .set = null_set_irqmode, ++ .get = param_get_int, ++}; ++ ++device_param_cb(irqmode, &null_irqmode_param_ops, &g_irqmode, 0444); ++MODULE_PARM_DESC(irqmode, "IRQ completion handler. 0-none, 1-softirq, 2-timer"); ++ ++static unsigned long g_completion_nsec = 10000; ++module_param_named(completion_nsec, g_completion_nsec, ulong, 0444); ++MODULE_PARM_DESC(completion_nsec, "Time in ns to complete a request in hardware. Default: 10,000ns"); ++ ++static int g_hw_queue_depth = 64; ++module_param_named(hw_queue_depth, g_hw_queue_depth, int, 0444); ++MODULE_PARM_DESC(hw_queue_depth, "Queue depth for each hardware queue. Default: 64"); ++ ++static bool g_use_per_node_hctx; ++module_param_named(use_per_node_hctx, g_use_per_node_hctx, bool, 0444); ++MODULE_PARM_DESC(use_per_node_hctx, "Use per-node allocation for hardware context queues. Default: false"); ++ ++static bool g_zoned; ++module_param_named(zoned, g_zoned, bool, S_IRUGO); ++MODULE_PARM_DESC(zoned, "Make device as a host-managed zoned block device. 
Default: false"); ++ ++static unsigned long g_zone_size = 256; ++module_param_named(zone_size, g_zone_size, ulong, S_IRUGO); ++MODULE_PARM_DESC(zone_size, "Zone size in MB when block device is zoned. Must be power-of-two: Default: 256"); ++ ++static unsigned long g_zone_capacity; ++module_param_named(zone_capacity, g_zone_capacity, ulong, 0444); ++MODULE_PARM_DESC(zone_capacity, "Zone capacity in MB when block device is zoned. Can be less than or equal to zone size. Default: Zone size"); ++ ++static unsigned int g_zone_nr_conv; ++module_param_named(zone_nr_conv, g_zone_nr_conv, uint, 0444); ++MODULE_PARM_DESC(zone_nr_conv, "Number of conventional zones when block device is zoned. Default: 0"); ++ ++static unsigned int g_zone_max_open; ++module_param_named(zone_max_open, g_zone_max_open, uint, 0444); ++MODULE_PARM_DESC(zone_max_open, "Maximum number of open zones when block device is zoned. Default: 0 (no limit)"); ++ ++static unsigned int g_zone_max_active; ++module_param_named(zone_max_active, g_zone_max_active, uint, 0444); ++MODULE_PARM_DESC(zone_max_active, "Maximum number of active zones when block device is zoned. Default: 0 (no limit)"); ++ ++static struct nullb_device *null_alloc_dev(void); ++static void null_free_dev(struct nullb_device *dev); ++static void null_del_dev(struct nullb *nullb); ++static int null_add_dev(struct nullb_device *dev); ++static void null_free_device_storage(struct nullb_device *dev, bool is_cache); ++ ++static inline struct nullb_device *to_nullb_device(struct config_item *item) ++{ ++ return item ? container_of(item, struct nullb_device, item) : NULL; ++} ++ ++static inline ssize_t nullb_device_uint_attr_show(unsigned int val, char *page) ++{ ++ return snprintf(page, PAGE_SIZE, "%u\n", val); ++} ++ ++static inline ssize_t nullb_device_ulong_attr_show(unsigned long val, ++ char *page) ++{ ++ return snprintf(page, PAGE_SIZE, "%lu\n", val); ++} ++ ++static inline ssize_t nullb_device_bool_attr_show(bool val, char *page) ++{ ++ return snprintf(page, PAGE_SIZE, "%u\n", val); ++} ++ ++static ssize_t nullb_device_uint_attr_store(unsigned int *val, ++ const char *page, size_t count) ++{ ++ unsigned int tmp; ++ int result; ++ ++ result = kstrtouint(page, 0, &tmp); ++ if (result < 0) ++ return result; ++ ++ *val = tmp; ++ return count; ++} ++ ++static ssize_t nullb_device_ulong_attr_store(unsigned long *val, ++ const char *page, size_t count) ++{ ++ int result; ++ unsigned long tmp; ++ ++ result = kstrtoul(page, 0, &tmp); ++ if (result < 0) ++ return result; ++ ++ *val = tmp; ++ return count; ++} ++ ++static ssize_t nullb_device_bool_attr_store(bool *val, const char *page, ++ size_t count) ++{ ++ bool tmp; ++ int result; ++ ++ result = kstrtobool(page, &tmp); ++ if (result < 0) ++ return result; ++ ++ *val = tmp; ++ return count; ++} ++ ++/* The following macro should only be used with TYPE = {uint, ulong, bool}. 
*/ ++#define NULLB_DEVICE_ATTR(NAME, TYPE, APPLY) \ ++static ssize_t \ ++nullb_device_##NAME##_show(struct config_item *item, char *page) \ ++{ \ ++ return nullb_device_##TYPE##_attr_show( \ ++ to_nullb_device(item)->NAME, page); \ ++} \ ++static ssize_t \ ++nullb_device_##NAME##_store(struct config_item *item, const char *page, \ ++ size_t count) \ ++{ \ ++ int (*apply_fn)(struct nullb_device *dev, TYPE new_value) = APPLY;\ ++ struct nullb_device *dev = to_nullb_device(item); \ ++ TYPE new_value = 0; \ ++ int ret; \ ++ \ ++ ret = nullb_device_##TYPE##_attr_store(&new_value, page, count);\ ++ if (ret < 0) \ ++ return ret; \ ++ if (apply_fn) \ ++ ret = apply_fn(dev, new_value); \ ++ else if (test_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags)) \ ++ ret = -EBUSY; \ ++ if (ret < 0) \ ++ return ret; \ ++ dev->NAME = new_value; \ ++ return count; \ ++} \ ++CONFIGFS_ATTR(nullb_device_, NAME); ++ ++static int nullb_apply_submit_queues(struct nullb_device *dev, ++ unsigned int submit_queues) ++{ ++ struct nullb *nullb = dev->nullb; ++ struct blk_mq_tag_set *set; ++ ++ if (!nullb) ++ return 0; ++ ++ /* ++ * Make sure that null_init_hctx() does not access nullb->queues[] past ++ * the end of that array. ++ */ ++ if (submit_queues > nr_cpu_ids) ++ return -EINVAL; ++ set = nullb->tag_set; ++ blk_mq_update_nr_hw_queues(set, submit_queues); ++ return set->nr_hw_queues == submit_queues ? 0 : -ENOMEM; ++} ++ ++NULLB_DEVICE_ATTR(size, ulong, NULL); ++NULLB_DEVICE_ATTR(completion_nsec, ulong, NULL); ++NULLB_DEVICE_ATTR(submit_queues, uint, nullb_apply_submit_queues); ++NULLB_DEVICE_ATTR(home_node, uint, NULL); ++NULLB_DEVICE_ATTR(queue_mode, uint, NULL); ++NULLB_DEVICE_ATTR(blocksize, uint, NULL); ++NULLB_DEVICE_ATTR(irqmode, uint, NULL); ++NULLB_DEVICE_ATTR(hw_queue_depth, uint, NULL); ++NULLB_DEVICE_ATTR(index, uint, NULL); ++NULLB_DEVICE_ATTR(blocking, bool, NULL); ++NULLB_DEVICE_ATTR(use_per_node_hctx, bool, NULL); ++NULLB_DEVICE_ATTR(memory_backed, bool, NULL); ++NULLB_DEVICE_ATTR(discard, bool, NULL); ++NULLB_DEVICE_ATTR(mbps, uint, NULL); ++NULLB_DEVICE_ATTR(cache_size, ulong, NULL); ++NULLB_DEVICE_ATTR(zoned, bool, NULL); ++NULLB_DEVICE_ATTR(zone_size, ulong, NULL); ++NULLB_DEVICE_ATTR(zone_capacity, ulong, NULL); ++NULLB_DEVICE_ATTR(zone_nr_conv, uint, NULL); ++NULLB_DEVICE_ATTR(zone_max_open, uint, NULL); ++NULLB_DEVICE_ATTR(zone_max_active, uint, NULL); ++ ++static ssize_t nullb_device_power_show(struct config_item *item, char *page) ++{ ++ return nullb_device_bool_attr_show(to_nullb_device(item)->power, page); ++} ++ ++static ssize_t nullb_device_power_store(struct config_item *item, ++ const char *page, size_t count) ++{ ++ struct nullb_device *dev = to_nullb_device(item); ++ bool newp = false; ++ ssize_t ret; ++ ++ ret = nullb_device_bool_attr_store(&newp, page, count); ++ if (ret < 0) ++ return ret; ++ ++ if (!dev->power && newp) { ++ if (test_and_set_bit(NULLB_DEV_FL_UP, &dev->flags)) ++ return count; ++ if (null_add_dev(dev)) { ++ clear_bit(NULLB_DEV_FL_UP, &dev->flags); ++ return -ENOMEM; ++ } ++ ++ set_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags); ++ dev->power = newp; ++ } else if (dev->power && !newp) { ++ if (test_and_clear_bit(NULLB_DEV_FL_UP, &dev->flags)) { ++ mutex_lock(&lock); ++ dev->power = newp; ++ null_del_dev(dev->nullb); ++ mutex_unlock(&lock); ++ } ++ clear_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags); ++ } ++ ++ return count; ++} ++ ++CONFIGFS_ATTR(nullb_device_, power); ++ ++static ssize_t nullb_device_badblocks_show(struct config_item *item, char *page) ++{ ++ struct nullb_device 
*t_dev = to_nullb_device(item); ++ ++ return badblocks_show(&t_dev->badblocks, page, 0); ++} ++ ++static ssize_t nullb_device_badblocks_store(struct config_item *item, ++ const char *page, size_t count) ++{ ++ struct nullb_device *t_dev = to_nullb_device(item); ++ char *orig, *buf, *tmp; ++ u64 start, end; ++ int ret; ++ ++ orig = kstrndup(page, count, GFP_KERNEL); ++ if (!orig) ++ return -ENOMEM; ++ ++ buf = strstrip(orig); ++ ++ ret = -EINVAL; ++ if (buf[0] != '+' && buf[0] != '-') ++ goto out; ++ tmp = strchr(&buf[1], '-'); ++ if (!tmp) ++ goto out; ++ *tmp = '\0'; ++ ret = kstrtoull(buf + 1, 0, &start); ++ if (ret) ++ goto out; ++ ret = kstrtoull(tmp + 1, 0, &end); ++ if (ret) ++ goto out; ++ ret = -EINVAL; ++ if (start > end) ++ goto out; ++ /* enable badblocks */ ++ cmpxchg(&t_dev->badblocks.shift, -1, 0); ++ if (buf[0] == '+') ++ ret = badblocks_set(&t_dev->badblocks, start, ++ end - start + 1, 1); ++ else ++ ret = badblocks_clear(&t_dev->badblocks, start, ++ end - start + 1); ++ if (ret == 0) ++ ret = count; ++out: ++ kfree(orig); ++ return ret; ++} ++CONFIGFS_ATTR(nullb_device_, badblocks); ++ ++static struct configfs_attribute *nullb_device_attrs[] = { ++ &nullb_device_attr_size, ++ &nullb_device_attr_completion_nsec, ++ &nullb_device_attr_submit_queues, ++ &nullb_device_attr_home_node, ++ &nullb_device_attr_queue_mode, ++ &nullb_device_attr_blocksize, ++ &nullb_device_attr_irqmode, ++ &nullb_device_attr_hw_queue_depth, ++ &nullb_device_attr_index, ++ &nullb_device_attr_blocking, ++ &nullb_device_attr_use_per_node_hctx, ++ &nullb_device_attr_power, ++ &nullb_device_attr_memory_backed, ++ &nullb_device_attr_discard, ++ &nullb_device_attr_mbps, ++ &nullb_device_attr_cache_size, ++ &nullb_device_attr_badblocks, ++ &nullb_device_attr_zoned, ++ &nullb_device_attr_zone_size, ++ &nullb_device_attr_zone_capacity, ++ &nullb_device_attr_zone_nr_conv, ++ &nullb_device_attr_zone_max_open, ++ &nullb_device_attr_zone_max_active, ++ NULL, ++}; ++ ++static void nullb_device_release(struct config_item *item) ++{ ++ struct nullb_device *dev = to_nullb_device(item); ++ ++ null_free_device_storage(dev, false); ++ null_free_dev(dev); ++} ++ ++static struct configfs_item_operations nullb_device_ops = { ++ .release = nullb_device_release, ++}; ++ ++static const struct config_item_type nullb_device_type = { ++ .ct_item_ops = &nullb_device_ops, ++ .ct_attrs = nullb_device_attrs, ++ .ct_owner = THIS_MODULE, ++}; ++ ++static struct ++config_item *nullb_group_make_item(struct config_group *group, const char *name) ++{ ++ struct nullb_device *dev; ++ ++ dev = null_alloc_dev(); ++ if (!dev) ++ return ERR_PTR(-ENOMEM); ++ ++ config_item_init_type_name(&dev->item, name, &nullb_device_type); ++ ++ return &dev->item; ++} ++ ++static void ++nullb_group_drop_item(struct config_group *group, struct config_item *item) ++{ ++ struct nullb_device *dev = to_nullb_device(item); ++ ++ if (test_and_clear_bit(NULLB_DEV_FL_UP, &dev->flags)) { ++ mutex_lock(&lock); ++ dev->power = false; ++ null_del_dev(dev->nullb); ++ mutex_unlock(&lock); ++ } ++ ++ config_item_put(item); ++} ++ ++static ssize_t memb_group_features_show(struct config_item *item, char *page) ++{ ++ return snprintf(page, PAGE_SIZE, ++ "memory_backed,discard,bandwidth,cache,badblocks,zoned,zone_size,zone_capacity,zone_nr_conv,zone_max_open,zone_max_active\n"); ++} ++ ++CONFIGFS_ATTR_RO(memb_group_, features); ++ ++static struct configfs_attribute *nullb_group_attrs[] = { ++ &memb_group_attr_features, ++ NULL, ++}; ++ ++static struct configfs_group_operations 
nullb_group_ops = { ++ .make_item = nullb_group_make_item, ++ .drop_item = nullb_group_drop_item, ++}; ++ ++static const struct config_item_type nullb_group_type = { ++ .ct_group_ops = &nullb_group_ops, ++ .ct_attrs = nullb_group_attrs, ++ .ct_owner = THIS_MODULE, ++}; ++ ++static struct configfs_subsystem nullb_subsys = { ++ .su_group = { ++ .cg_item = { ++ .ci_namebuf = "nullb", ++ .ci_type = &nullb_group_type, ++ }, ++ }, ++}; ++ ++static inline int null_cache_active(struct nullb *nullb) ++{ ++ return test_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags); ++} ++ ++static struct nullb_device *null_alloc_dev(void) ++{ ++ struct nullb_device *dev; ++ ++ dev = kzalloc(sizeof(*dev), GFP_KERNEL); ++ if (!dev) ++ return NULL; ++ INIT_RADIX_TREE(&dev->data, GFP_ATOMIC); ++ INIT_RADIX_TREE(&dev->cache, GFP_ATOMIC); ++ if (badblocks_init(&dev->badblocks, 0)) { ++ kfree(dev); ++ return NULL; ++ } ++ ++ dev->size = g_gb * 1024; ++ dev->completion_nsec = g_completion_nsec; ++ dev->submit_queues = g_submit_queues; ++ dev->home_node = g_home_node; ++ dev->queue_mode = g_queue_mode; ++ dev->blocksize = g_bs; ++ dev->irqmode = g_irqmode; ++ dev->hw_queue_depth = g_hw_queue_depth; ++ dev->blocking = g_blocking; ++ dev->use_per_node_hctx = g_use_per_node_hctx; ++ dev->zoned = g_zoned; ++ dev->zone_size = g_zone_size; ++ dev->zone_capacity = g_zone_capacity; ++ dev->zone_nr_conv = g_zone_nr_conv; ++ dev->zone_max_open = g_zone_max_open; ++ dev->zone_max_active = g_zone_max_active; ++ return dev; ++} ++ ++static void null_free_dev(struct nullb_device *dev) ++{ ++ if (!dev) ++ return; ++ ++ null_free_zoned_dev(dev); ++ badblocks_exit(&dev->badblocks); ++ kfree(dev); ++} ++ ++static void put_tag(struct nullb_queue *nq, unsigned int tag) ++{ ++ clear_bit_unlock(tag, nq->tag_map); ++ ++ if (waitqueue_active(&nq->wait)) ++ wake_up(&nq->wait); ++} ++ ++static unsigned int get_tag(struct nullb_queue *nq) ++{ ++ unsigned int tag; ++ ++ do { ++ tag = find_first_zero_bit(nq->tag_map, nq->queue_depth); ++ if (tag >= nq->queue_depth) ++ return -1U; ++ } while (test_and_set_bit_lock(tag, nq->tag_map)); ++ ++ return tag; ++} ++ ++static void free_cmd(struct nullb_cmd *cmd) ++{ ++ put_tag(cmd->nq, cmd->tag); ++} ++ ++static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer); ++ ++static struct nullb_cmd *__alloc_cmd(struct nullb_queue *nq) ++{ ++ struct nullb_cmd *cmd; ++ unsigned int tag; ++ ++ tag = get_tag(nq); ++ if (tag != -1U) { ++ cmd = &nq->cmds[tag]; ++ cmd->tag = tag; ++ cmd->error = BLK_STS_OK; ++ cmd->nq = nq; ++ if (nq->dev->irqmode == NULL_IRQ_TIMER) { ++ hrtimer_init(&cmd->timer, CLOCK_MONOTONIC, ++ HRTIMER_MODE_REL); ++ cmd->timer.function = null_cmd_timer_expired; ++ } ++ return cmd; ++ } ++ ++ return NULL; ++} ++ ++static struct nullb_cmd *alloc_cmd(struct nullb_queue *nq, int can_wait) ++{ ++ struct nullb_cmd *cmd; ++ DEFINE_WAIT(wait); ++ ++ cmd = __alloc_cmd(nq); ++ if (cmd || !can_wait) ++ return cmd; ++ ++ do { ++ prepare_to_wait(&nq->wait, &wait, TASK_UNINTERRUPTIBLE); ++ cmd = __alloc_cmd(nq); ++ if (cmd) ++ break; ++ ++ io_schedule(); ++ } while (1); ++ ++ finish_wait(&nq->wait, &wait); ++ return cmd; ++} ++ ++static void end_cmd(struct nullb_cmd *cmd) ++{ ++ int queue_mode = cmd->nq->dev->queue_mode; ++ ++ switch (queue_mode) { ++ case NULL_Q_MQ: ++ blk_mq_end_request(cmd->rq, cmd->error); ++ return; ++ case NULL_Q_BIO: ++ cmd->bio->bi_status = cmd->error; ++ bio_endio(cmd->bio); ++ break; ++ } ++ ++ free_cmd(cmd); ++} ++ ++static enum hrtimer_restart null_cmd_timer_expired(struct 
hrtimer *timer) ++{ ++ end_cmd(container_of(timer, struct nullb_cmd, timer)); ++ ++ return HRTIMER_NORESTART; ++} ++ ++static void null_cmd_end_timer(struct nullb_cmd *cmd) ++{ ++ ktime_t kt = cmd->nq->dev->completion_nsec; ++ ++ hrtimer_start(&cmd->timer, kt, HRTIMER_MODE_REL); ++} ++ ++static void null_complete_rq(struct request *rq) ++{ ++ end_cmd(blk_mq_rq_to_pdu(rq)); ++} ++ ++static struct nullb_page *null_alloc_page(gfp_t gfp_flags) ++{ ++ struct nullb_page *t_page; ++ ++ t_page = kmalloc(sizeof(struct nullb_page), gfp_flags); ++ if (!t_page) ++ goto out; ++ ++ t_page->page = alloc_pages(gfp_flags, 0); ++ if (!t_page->page) ++ goto out_freepage; ++ ++ memset(t_page->bitmap, 0, sizeof(t_page->bitmap)); ++ return t_page; ++out_freepage: ++ kfree(t_page); ++out: ++ return NULL; ++} ++ ++static void null_free_page(struct nullb_page *t_page) ++{ ++ __set_bit(NULLB_PAGE_FREE, t_page->bitmap); ++ if (test_bit(NULLB_PAGE_LOCK, t_page->bitmap)) ++ return; ++ __free_page(t_page->page); ++ kfree(t_page); ++} ++ ++static bool null_page_empty(struct nullb_page *page) ++{ ++ int size = MAP_SZ - 2; ++ ++ return find_first_bit(page->bitmap, size) == size; ++} ++ ++static void null_free_sector(struct nullb *nullb, sector_t sector, ++ bool is_cache) ++{ ++ unsigned int sector_bit; ++ u64 idx; ++ struct nullb_page *t_page, *ret; ++ struct radix_tree_root *root; ++ ++ root = is_cache ? &nullb->dev->cache : &nullb->dev->data; ++ idx = sector >> PAGE_SECTORS_SHIFT; ++ sector_bit = (sector & SECTOR_MASK); ++ ++ t_page = radix_tree_lookup(root, idx); ++ if (t_page) { ++ __clear_bit(sector_bit, t_page->bitmap); ++ ++ if (null_page_empty(t_page)) { ++ ret = radix_tree_delete_item(root, idx, t_page); ++ WARN_ON(ret != t_page); ++ null_free_page(ret); ++ if (is_cache) ++ nullb->dev->curr_cache -= PAGE_SIZE; ++ } ++ } ++} ++ ++static struct nullb_page *null_radix_tree_insert(struct nullb *nullb, u64 idx, ++ struct nullb_page *t_page, bool is_cache) ++{ ++ struct radix_tree_root *root; ++ ++ root = is_cache ? &nullb->dev->cache : &nullb->dev->data; ++ ++ if (radix_tree_insert(root, idx, t_page)) { ++ null_free_page(t_page); ++ t_page = radix_tree_lookup(root, idx); ++ WARN_ON(!t_page || t_page->page->index != idx); ++ } else if (is_cache) ++ nullb->dev->curr_cache += PAGE_SIZE; ++ ++ return t_page; ++} ++ ++static void null_free_device_storage(struct nullb_device *dev, bool is_cache) ++{ ++ unsigned long pos = 0; ++ int nr_pages; ++ struct nullb_page *ret, *t_pages[FREE_BATCH]; ++ struct radix_tree_root *root; ++ ++ root = is_cache ? &dev->cache : &dev->data; ++ ++ do { ++ int i; ++ ++ nr_pages = radix_tree_gang_lookup(root, ++ (void **)t_pages, pos, FREE_BATCH); ++ ++ for (i = 0; i < nr_pages; i++) { ++ pos = t_pages[i]->page->index; ++ ret = radix_tree_delete_item(root, pos, t_pages[i]); ++ WARN_ON(ret != t_pages[i]); ++ null_free_page(ret); ++ } ++ ++ pos++; ++ } while (nr_pages == FREE_BATCH); ++ ++ if (is_cache) ++ dev->curr_cache = 0; ++} ++ ++static struct nullb_page *__null_lookup_page(struct nullb *nullb, ++ sector_t sector, bool for_write, bool is_cache) ++{ ++ unsigned int sector_bit; ++ u64 idx; ++ struct nullb_page *t_page; ++ struct radix_tree_root *root; ++ ++ idx = sector >> PAGE_SECTORS_SHIFT; ++ sector_bit = (sector & SECTOR_MASK); ++ ++ root = is_cache ? 
&nullb->dev->cache : &nullb->dev->data; ++ t_page = radix_tree_lookup(root, idx); ++ WARN_ON(t_page && t_page->page->index != idx); ++ ++ if (t_page && (for_write || test_bit(sector_bit, t_page->bitmap))) ++ return t_page; ++ ++ return NULL; ++} ++ ++static struct nullb_page *null_lookup_page(struct nullb *nullb, ++ sector_t sector, bool for_write, bool ignore_cache) ++{ ++ struct nullb_page *page = NULL; ++ ++ if (!ignore_cache) ++ page = __null_lookup_page(nullb, sector, for_write, true); ++ if (page) ++ return page; ++ return __null_lookup_page(nullb, sector, for_write, false); ++} ++ ++static struct nullb_page *null_insert_page(struct nullb *nullb, ++ sector_t sector, bool ignore_cache) ++ __releases(&nullb->lock) ++ __acquires(&nullb->lock) ++{ ++ u64 idx; ++ struct nullb_page *t_page; ++ ++ t_page = null_lookup_page(nullb, sector, true, ignore_cache); ++ if (t_page) ++ return t_page; ++ ++ spin_unlock_irq(&nullb->lock); ++ ++ t_page = null_alloc_page(GFP_NOIO); ++ if (!t_page) ++ goto out_lock; ++ ++ if (radix_tree_preload(GFP_NOIO)) ++ goto out_freepage; ++ ++ spin_lock_irq(&nullb->lock); ++ idx = sector >> PAGE_SECTORS_SHIFT; ++ t_page->page->index = idx; ++ t_page = null_radix_tree_insert(nullb, idx, t_page, !ignore_cache); ++ radix_tree_preload_end(); ++ ++ return t_page; ++out_freepage: ++ null_free_page(t_page); ++out_lock: ++ spin_lock_irq(&nullb->lock); ++ return null_lookup_page(nullb, sector, true, ignore_cache); ++} ++ ++static int null_flush_cache_page(struct nullb *nullb, struct nullb_page *c_page) ++{ ++ int i; ++ unsigned int offset; ++ u64 idx; ++ struct nullb_page *t_page, *ret; ++ void *dst, *src; ++ ++ idx = c_page->page->index; ++ ++ t_page = null_insert_page(nullb, idx << PAGE_SECTORS_SHIFT, true); ++ ++ __clear_bit(NULLB_PAGE_LOCK, c_page->bitmap); ++ if (test_bit(NULLB_PAGE_FREE, c_page->bitmap)) { ++ null_free_page(c_page); ++ if (t_page && null_page_empty(t_page)) { ++ ret = radix_tree_delete_item(&nullb->dev->data, ++ idx, t_page); ++ null_free_page(t_page); ++ } ++ return 0; ++ } ++ ++ if (!t_page) ++ return -ENOMEM; ++ ++ src = kmap_atomic(c_page->page); ++ dst = kmap_atomic(t_page->page); ++ ++ for (i = 0; i < PAGE_SECTORS; ++ i += (nullb->dev->blocksize >> SECTOR_SHIFT)) { ++ if (test_bit(i, c_page->bitmap)) { ++ offset = (i << SECTOR_SHIFT); ++ memcpy(dst + offset, src + offset, ++ nullb->dev->blocksize); ++ __set_bit(i, t_page->bitmap); ++ } ++ } ++ ++ kunmap_atomic(dst); ++ kunmap_atomic(src); ++ ++ ret = radix_tree_delete_item(&nullb->dev->cache, idx, c_page); ++ null_free_page(ret); ++ nullb->dev->curr_cache -= PAGE_SIZE; ++ ++ return 0; ++} ++ ++static int null_make_cache_space(struct nullb *nullb, unsigned long n) ++{ ++ int i, err, nr_pages; ++ struct nullb_page *c_pages[FREE_BATCH]; ++ unsigned long flushed = 0, one_round; ++ ++again: ++ if ((nullb->dev->cache_size * 1024 * 1024) > ++ nullb->dev->curr_cache + n || nullb->dev->curr_cache == 0) ++ return 0; ++ ++ nr_pages = radix_tree_gang_lookup(&nullb->dev->cache, ++ (void **)c_pages, nullb->cache_flush_pos, FREE_BATCH); ++ /* ++ * nullb_flush_cache_page could unlock before using the c_pages. 
To ++ * avoid race, we don't allow page free ++ */ ++ for (i = 0; i < nr_pages; i++) { ++ nullb->cache_flush_pos = c_pages[i]->page->index; ++ /* ++ * We found the page which is being flushed to disk by other ++ * threads ++ */ ++ if (test_bit(NULLB_PAGE_LOCK, c_pages[i]->bitmap)) ++ c_pages[i] = NULL; ++ else ++ __set_bit(NULLB_PAGE_LOCK, c_pages[i]->bitmap); ++ } ++ ++ one_round = 0; ++ for (i = 0; i < nr_pages; i++) { ++ if (c_pages[i] == NULL) ++ continue; ++ err = null_flush_cache_page(nullb, c_pages[i]); ++ if (err) ++ return err; ++ one_round++; ++ } ++ flushed += one_round << PAGE_SHIFT; ++ ++ if (n > flushed) { ++ if (nr_pages == 0) ++ nullb->cache_flush_pos = 0; ++ if (one_round == 0) { ++ /* give other threads a chance */ ++ spin_unlock_irq(&nullb->lock); ++ spin_lock_irq(&nullb->lock); ++ } ++ goto again; ++ } ++ return 0; ++} ++ ++static int copy_to_nullb(struct nullb *nullb, struct page *source, ++ unsigned int off, sector_t sector, size_t n, bool is_fua) ++{ ++ size_t temp, count = 0; ++ unsigned int offset; ++ struct nullb_page *t_page; ++ void *dst, *src; ++ ++ while (count < n) { ++ temp = min_t(size_t, nullb->dev->blocksize, n - count); ++ ++ if (null_cache_active(nullb) && !is_fua) ++ null_make_cache_space(nullb, PAGE_SIZE); ++ ++ offset = (sector & SECTOR_MASK) << SECTOR_SHIFT; ++ t_page = null_insert_page(nullb, sector, ++ !null_cache_active(nullb) || is_fua); ++ if (!t_page) ++ return -ENOSPC; ++ ++ src = kmap_atomic(source); ++ dst = kmap_atomic(t_page->page); ++ memcpy(dst + offset, src + off + count, temp); ++ kunmap_atomic(dst); ++ kunmap_atomic(src); ++ ++ __set_bit(sector & SECTOR_MASK, t_page->bitmap); ++ ++ if (is_fua) ++ null_free_sector(nullb, sector, true); ++ ++ count += temp; ++ sector += temp >> SECTOR_SHIFT; ++ } ++ return 0; ++} ++ ++static int copy_from_nullb(struct nullb *nullb, struct page *dest, ++ unsigned int off, sector_t sector, size_t n) ++{ ++ size_t temp, count = 0; ++ unsigned int offset; ++ struct nullb_page *t_page; ++ void *dst, *src; ++ ++ while (count < n) { ++ temp = min_t(size_t, nullb->dev->blocksize, n - count); ++ ++ offset = (sector & SECTOR_MASK) << SECTOR_SHIFT; ++ t_page = null_lookup_page(nullb, sector, false, ++ !null_cache_active(nullb)); ++ ++ dst = kmap_atomic(dest); ++ if (!t_page) { ++ memset(dst + off + count, 0, temp); ++ goto next; ++ } ++ src = kmap_atomic(t_page->page); ++ memcpy(dst + off + count, src + offset, temp); ++ kunmap_atomic(src); ++next: ++ kunmap_atomic(dst); ++ ++ count += temp; ++ sector += temp >> SECTOR_SHIFT; ++ } ++ return 0; ++} ++ ++static void nullb_fill_pattern(struct nullb *nullb, struct page *page, ++ unsigned int len, unsigned int off) ++{ ++ void *dst; ++ ++ dst = kmap_atomic(page); ++ memset(dst + off, 0xFF, len); ++ kunmap_atomic(dst); ++} ++ ++static void null_handle_discard(struct nullb *nullb, sector_t sector, size_t n) ++{ ++ size_t temp; ++ ++ spin_lock_irq(&nullb->lock); ++ while (n > 0) { ++ temp = min_t(size_t, n, nullb->dev->blocksize); ++ null_free_sector(nullb, sector, false); ++ if (null_cache_active(nullb)) ++ null_free_sector(nullb, sector, true); ++ sector += temp >> SECTOR_SHIFT; ++ n -= temp; ++ } ++ spin_unlock_irq(&nullb->lock); ++} ++ ++static int null_handle_flush(struct nullb *nullb) ++{ ++ int err; ++ ++ if (!null_cache_active(nullb)) ++ return 0; ++ ++ spin_lock_irq(&nullb->lock); ++ while (true) { ++ err = null_make_cache_space(nullb, ++ nullb->dev->cache_size * 1024 * 1024); ++ if (err || nullb->dev->curr_cache == 0) ++ break; ++ } ++ ++ 
WARN_ON(!radix_tree_empty(&nullb->dev->cache)); ++ spin_unlock_irq(&nullb->lock); ++ return err; ++} ++ ++static int null_transfer(struct nullb *nullb, struct page *page, ++ unsigned int len, unsigned int off, bool is_write, sector_t sector, ++ bool is_fua) ++{ ++ struct nullb_device *dev = nullb->dev; ++ unsigned int valid_len = len; ++ int err = 0; ++ ++ if (!is_write) { ++ if (dev->zoned) ++ valid_len = null_zone_valid_read_len(nullb, ++ sector, len); ++ ++ if (valid_len) { ++ err = copy_from_nullb(nullb, page, off, ++ sector, valid_len); ++ off += valid_len; ++ len -= valid_len; ++ } ++ ++ if (len) ++ nullb_fill_pattern(nullb, page, len, off); ++ flush_dcache_page(page); ++ } else { ++ flush_dcache_page(page); ++ err = copy_to_nullb(nullb, page, off, sector, len, is_fua); ++ } ++ ++ return err; ++} ++ ++static int null_handle_rq(struct nullb_cmd *cmd) ++{ ++ struct request *rq = cmd->rq; ++ struct nullb *nullb = cmd->nq->dev->nullb; ++ int err; ++ unsigned int len; ++ sector_t sector; ++ struct req_iterator iter; ++ struct bio_vec bvec; ++ ++ sector = blk_rq_pos(rq); ++ ++ if (req_op(rq) == REQ_OP_DISCARD) { ++ null_handle_discard(nullb, sector, blk_rq_bytes(rq)); ++ return 0; ++ } ++ ++ spin_lock_irq(&nullb->lock); ++ rq_for_each_segment(bvec, rq, iter) { ++ len = bvec.bv_len; ++ err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset, ++ op_is_write(req_op(rq)), sector, ++ rq->cmd_flags & REQ_FUA); ++ if (err) { ++ spin_unlock_irq(&nullb->lock); ++ return err; ++ } ++ sector += len >> SECTOR_SHIFT; ++ } ++ spin_unlock_irq(&nullb->lock); ++ ++ return 0; ++} ++ ++static int null_handle_bio(struct nullb_cmd *cmd) ++{ ++ struct bio *bio = cmd->bio; ++ struct nullb *nullb = cmd->nq->dev->nullb; ++ int err; ++ unsigned int len; ++ sector_t sector; ++ struct bio_vec bvec; ++ struct bvec_iter iter; ++ ++ sector = bio->bi_iter.bi_sector; ++ ++ if (bio_op(bio) == REQ_OP_DISCARD) { ++ null_handle_discard(nullb, sector, ++ bio_sectors(bio) << SECTOR_SHIFT); ++ return 0; ++ } ++ ++ spin_lock_irq(&nullb->lock); ++ bio_for_each_segment(bvec, bio, iter) { ++ len = bvec.bv_len; ++ err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset, ++ op_is_write(bio_op(bio)), sector, ++ bio->bi_opf & REQ_FUA); ++ if (err) { ++ spin_unlock_irq(&nullb->lock); ++ return err; ++ } ++ sector += len >> SECTOR_SHIFT; ++ } ++ spin_unlock_irq(&nullb->lock); ++ return 0; ++} ++ ++static void null_stop_queue(struct nullb *nullb) ++{ ++ struct request_queue *q = nullb->q; ++ ++ if (nullb->dev->queue_mode == NULL_Q_MQ) ++ blk_mq_stop_hw_queues(q); ++} ++ ++static void null_restart_queue_async(struct nullb *nullb) ++{ ++ struct request_queue *q = nullb->q; ++ ++ if (nullb->dev->queue_mode == NULL_Q_MQ) ++ blk_mq_start_stopped_hw_queues(q, true); ++} ++ ++static inline blk_status_t null_handle_throttled(struct nullb_cmd *cmd) ++{ ++ struct nullb_device *dev = cmd->nq->dev; ++ struct nullb *nullb = dev->nullb; ++ blk_status_t sts = BLK_STS_OK; ++ struct request *rq = cmd->rq; ++ ++ if (!hrtimer_active(&nullb->bw_timer)) ++ hrtimer_restart(&nullb->bw_timer); ++ ++ if (atomic_long_sub_return(blk_rq_bytes(rq), &nullb->cur_bytes) < 0) { ++ null_stop_queue(nullb); ++ /* race with timer */ ++ if (atomic_long_read(&nullb->cur_bytes) > 0) ++ null_restart_queue_async(nullb); ++ /* requeue request */ ++ sts = BLK_STS_DEV_RESOURCE; ++ } ++ return sts; ++} ++ ++static inline blk_status_t null_handle_badblocks(struct nullb_cmd *cmd, ++ sector_t sector, ++ sector_t nr_sectors) ++{ ++ struct badblocks *bb = 
&cmd->nq->dev->badblocks; ++ sector_t first_bad; ++ int bad_sectors; ++ ++ if (badblocks_check(bb, sector, nr_sectors, &first_bad, &bad_sectors)) ++ return BLK_STS_IOERR; ++ ++ return BLK_STS_OK; ++} ++ ++static inline blk_status_t null_handle_memory_backed(struct nullb_cmd *cmd, ++ enum req_opf op) ++{ ++ struct nullb_device *dev = cmd->nq->dev; ++ int err; ++ ++ if (dev->queue_mode == NULL_Q_BIO) ++ err = null_handle_bio(cmd); ++ else ++ err = null_handle_rq(cmd); ++ ++ return errno_to_blk_status(err); ++} ++ ++static void nullb_zero_read_cmd_buffer(struct nullb_cmd *cmd) ++{ ++ struct nullb_device *dev = cmd->nq->dev; ++ struct bio *bio; ++ ++ if (dev->memory_backed) ++ return; ++ ++ if (dev->queue_mode == NULL_Q_BIO && bio_op(cmd->bio) == REQ_OP_READ) { ++ zero_fill_bio(cmd->bio); ++ } else if (req_op(cmd->rq) == REQ_OP_READ) { ++ __rq_for_each_bio(bio, cmd->rq) ++ zero_fill_bio(bio); ++ } ++} ++ ++static inline void nullb_complete_cmd(struct nullb_cmd *cmd) ++{ ++ /* ++ * Since root privileges are required to configure the null_blk ++ * driver, it is fine that this driver does not initialize the ++ * data buffers of read commands. Zero-initialize these buffers ++ * anyway if KMSAN is enabled to prevent that KMSAN complains ++ * about null_blk not initializing read data buffers. ++ */ ++ if (IS_ENABLED(CONFIG_KMSAN)) ++ nullb_zero_read_cmd_buffer(cmd); ++ ++ /* Complete IO by inline, softirq or timer */ ++ switch (cmd->nq->dev->irqmode) { ++ case NULL_IRQ_SOFTIRQ: ++ switch (cmd->nq->dev->queue_mode) { ++ case NULL_Q_MQ: ++ blk_mq_complete_request(cmd->rq); ++ break; ++ case NULL_Q_BIO: ++ /* ++ * XXX: no proper submitting cpu information available. ++ */ ++ end_cmd(cmd); ++ break; ++ } ++ break; ++ case NULL_IRQ_NONE: ++ end_cmd(cmd); ++ break; ++ case NULL_IRQ_TIMER: ++ null_cmd_end_timer(cmd); ++ break; ++ } ++} ++ ++blk_status_t null_process_cmd(struct nullb_cmd *cmd, ++ enum req_opf op, sector_t sector, ++ unsigned int nr_sectors) ++{ ++ struct nullb_device *dev = cmd->nq->dev; ++ blk_status_t ret; ++ ++ if (dev->badblocks.shift != -1) { ++ ret = null_handle_badblocks(cmd, sector, nr_sectors); ++ if (ret != BLK_STS_OK) ++ return ret; ++ } ++ ++ if (dev->memory_backed) ++ return null_handle_memory_backed(cmd, op); ++ ++ return BLK_STS_OK; ++} ++ ++static blk_status_t null_handle_cmd(struct nullb_cmd *cmd, sector_t sector, ++ sector_t nr_sectors, enum req_opf op) ++{ ++ struct nullb_device *dev = cmd->nq->dev; ++ struct nullb *nullb = dev->nullb; ++ blk_status_t sts; ++ ++ if (test_bit(NULLB_DEV_FL_THROTTLED, &dev->flags)) { ++ sts = null_handle_throttled(cmd); ++ if (sts != BLK_STS_OK) ++ return sts; ++ } ++ ++ if (op == REQ_OP_FLUSH) { ++ cmd->error = errno_to_blk_status(null_handle_flush(nullb)); ++ goto out; ++ } ++ ++ if (dev->zoned) ++ sts = null_process_zoned_cmd(cmd, op, sector, nr_sectors); ++ else ++ sts = null_process_cmd(cmd, op, sector, nr_sectors); ++ ++ /* Do not overwrite errors (e.g. 
timeout errors) */ ++ if (cmd->error == BLK_STS_OK) ++ cmd->error = sts; ++ ++out: ++ nullb_complete_cmd(cmd); ++ return BLK_STS_OK; ++} ++ ++static enum hrtimer_restart nullb_bwtimer_fn(struct hrtimer *timer) ++{ ++ struct nullb *nullb = container_of(timer, struct nullb, bw_timer); ++ ktime_t timer_interval = ktime_set(0, TIMER_INTERVAL); ++ unsigned int mbps = nullb->dev->mbps; ++ ++ if (atomic_long_read(&nullb->cur_bytes) == mb_per_tick(mbps)) ++ return HRTIMER_NORESTART; ++ ++ atomic_long_set(&nullb->cur_bytes, mb_per_tick(mbps)); ++ null_restart_queue_async(nullb); ++ ++ hrtimer_forward_now(&nullb->bw_timer, timer_interval); ++ ++ return HRTIMER_RESTART; ++} ++ ++static void nullb_setup_bwtimer(struct nullb *nullb) ++{ ++ ktime_t timer_interval = ktime_set(0, TIMER_INTERVAL); ++ ++ hrtimer_init(&nullb->bw_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); ++ nullb->bw_timer.function = nullb_bwtimer_fn; ++ atomic_long_set(&nullb->cur_bytes, mb_per_tick(nullb->dev->mbps)); ++ hrtimer_start(&nullb->bw_timer, timer_interval, HRTIMER_MODE_REL); ++} ++ ++static struct nullb_queue *nullb_to_queue(struct nullb *nullb) ++{ ++ int index = 0; ++ ++ if (nullb->nr_queues != 1) ++ index = raw_smp_processor_id() / ((nr_cpu_ids + nullb->nr_queues - 1) / nullb->nr_queues); ++ ++ return &nullb->queues[index]; ++} ++ ++static blk_qc_t null_submit_bio(struct bio *bio) ++{ ++ sector_t sector = bio->bi_iter.bi_sector; ++ sector_t nr_sectors = bio_sectors(bio); ++ struct nullb *nullb = bio->bi_disk->private_data; ++ struct nullb_queue *nq = nullb_to_queue(nullb); ++ struct nullb_cmd *cmd; ++ ++ cmd = alloc_cmd(nq, 1); ++ cmd->bio = bio; ++ ++ null_handle_cmd(cmd, sector, nr_sectors, bio_op(bio)); ++ return BLK_QC_T_NONE; ++} ++ ++static bool should_timeout_request(struct request *rq) ++{ ++#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION ++ if (g_timeout_str[0]) ++ return should_fail(&null_timeout_attr, 1); ++#endif ++ return false; ++} ++ ++static bool should_requeue_request(struct request *rq) ++{ ++#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION ++ if (g_requeue_str[0]) ++ return should_fail(&null_requeue_attr, 1); ++#endif ++ return false; ++} ++ ++static enum blk_eh_timer_return null_timeout_rq(struct request *rq, bool res) ++{ ++ struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq); ++ ++ pr_info("rq %p timed out\n", rq); ++ ++ /* ++ * If the device is marked as blocking (i.e. memory backed or zoned ++ * device), the submission path may be blocked waiting for resources ++ * and cause real timeouts. For these real timeouts, the submission ++ * path will complete the request using blk_mq_complete_request(). ++ * Only fake timeouts need to execute blk_mq_complete_request() here. 
++ */ ++ cmd->error = BLK_STS_TIMEOUT; ++ if (cmd->fake_timeout) ++ blk_mq_complete_request(rq); ++ return BLK_EH_DONE; ++} ++ ++static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx, ++ const struct blk_mq_queue_data *bd) ++{ ++ struct nullb_cmd *cmd = blk_mq_rq_to_pdu(bd->rq); ++ struct nullb_queue *nq = hctx->driver_data; ++ sector_t nr_sectors = blk_rq_sectors(bd->rq); ++ sector_t sector = blk_rq_pos(bd->rq); ++ ++ might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING); ++ ++ if (nq->dev->irqmode == NULL_IRQ_TIMER) { ++ hrtimer_init(&cmd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); ++ cmd->timer.function = null_cmd_timer_expired; ++ } ++ cmd->rq = bd->rq; ++ cmd->error = BLK_STS_OK; ++ cmd->nq = nq; ++ cmd->fake_timeout = should_timeout_request(bd->rq) || ++ blk_should_fake_timeout(bd->rq->q); ++ ++ blk_mq_start_request(bd->rq); ++ ++ if (should_requeue_request(bd->rq)) { ++ /* ++ * Alternate between hitting the core BUSY path, and the ++ * driver driven requeue path ++ */ ++ nq->requeue_selection++; ++ if (nq->requeue_selection & 1) ++ return BLK_STS_RESOURCE; ++ else { ++ blk_mq_requeue_request(bd->rq, true); ++ return BLK_STS_OK; ++ } ++ } ++ if (cmd->fake_timeout) ++ return BLK_STS_OK; ++ ++ return null_handle_cmd(cmd, sector, nr_sectors, req_op(bd->rq)); ++} ++ ++static void cleanup_queue(struct nullb_queue *nq) ++{ ++ kfree(nq->tag_map); ++ kfree(nq->cmds); ++} ++ ++static void cleanup_queues(struct nullb *nullb) ++{ ++ int i; ++ ++ for (i = 0; i < nullb->nr_queues; i++) ++ cleanup_queue(&nullb->queues[i]); ++ ++ kfree(nullb->queues); ++} ++ ++static void null_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) ++{ ++ struct nullb_queue *nq = hctx->driver_data; ++ struct nullb *nullb = nq->dev->nullb; ++ ++ nullb->nr_queues--; ++} ++ ++static void null_init_queue(struct nullb *nullb, struct nullb_queue *nq) ++{ ++ init_waitqueue_head(&nq->wait); ++ nq->queue_depth = nullb->queue_depth; ++ nq->dev = nullb->dev; ++} ++ ++static int null_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data, ++ unsigned int hctx_idx) ++{ ++ struct nullb *nullb = hctx->queue->queuedata; ++ struct nullb_queue *nq; ++ ++#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION ++ if (g_init_hctx_str[0] && should_fail(&null_init_hctx_attr, 1)) ++ return -EFAULT; ++#endif ++ ++ nq = &nullb->queues[hctx_idx]; ++ hctx->driver_data = nq; ++ null_init_queue(nullb, nq); ++ nullb->nr_queues++; ++ ++ return 0; ++} ++ ++static const struct blk_mq_ops null_mq_ops = { ++ .queue_rq = null_queue_rq, ++ .complete = null_complete_rq, ++ .timeout = null_timeout_rq, ++ .init_hctx = null_init_hctx, ++ .exit_hctx = null_exit_hctx, ++}; ++ ++static void null_del_dev(struct nullb *nullb) ++{ ++ struct nullb_device *dev; ++ ++ if (!nullb) ++ return; ++ ++ dev = nullb->dev; ++ ++ ida_simple_remove(&nullb_indexes, nullb->index); ++ ++ list_del_init(&nullb->list); ++ ++ del_gendisk(nullb->disk); ++ ++ if (test_bit(NULLB_DEV_FL_THROTTLED, &nullb->dev->flags)) { ++ hrtimer_cancel(&nullb->bw_timer); ++ atomic_long_set(&nullb->cur_bytes, LONG_MAX); ++ null_restart_queue_async(nullb); ++ } ++ ++ blk_cleanup_queue(nullb->q); ++ if (dev->queue_mode == NULL_Q_MQ && ++ nullb->tag_set == &nullb->__tag_set) ++ blk_mq_free_tag_set(nullb->tag_set); ++ put_disk(nullb->disk); ++ cleanup_queues(nullb); ++ if (null_cache_active(nullb)) ++ null_free_device_storage(nullb->dev, true); ++ kfree(nullb); ++ dev->nullb = NULL; ++} ++ ++static void null_config_discard(struct nullb *nullb) ++{ ++ if (nullb->dev->discard == false) ++ return; ++ ++ if 
(nullb->dev->zoned) { ++ nullb->dev->discard = false; ++ pr_info("discard option is ignored in zoned mode\n"); ++ return; ++ } ++ ++ nullb->q->limits.discard_granularity = nullb->dev->blocksize; ++ nullb->q->limits.discard_alignment = nullb->dev->blocksize; ++ blk_queue_max_discard_sectors(nullb->q, UINT_MAX >> 9); ++ blk_queue_flag_set(QUEUE_FLAG_DISCARD, nullb->q); ++} ++ ++static const struct block_device_operations null_bio_ops = { ++ .owner = THIS_MODULE, ++ .submit_bio = null_submit_bio, ++ .report_zones = null_report_zones, ++}; ++ ++static const struct block_device_operations null_rq_ops = { ++ .owner = THIS_MODULE, ++ .report_zones = null_report_zones, ++}; ++ ++static int setup_commands(struct nullb_queue *nq) ++{ ++ struct nullb_cmd *cmd; ++ int i, tag_size; ++ ++ nq->cmds = kcalloc(nq->queue_depth, sizeof(*cmd), GFP_KERNEL); ++ if (!nq->cmds) ++ return -ENOMEM; ++ ++ tag_size = ALIGN(nq->queue_depth, BITS_PER_LONG) / BITS_PER_LONG; ++ nq->tag_map = kcalloc(tag_size, sizeof(unsigned long), GFP_KERNEL); ++ if (!nq->tag_map) { ++ kfree(nq->cmds); ++ return -ENOMEM; ++ } ++ ++ for (i = 0; i < nq->queue_depth; i++) { ++ cmd = &nq->cmds[i]; ++ cmd->tag = -1U; ++ } ++ ++ return 0; ++} ++ ++static int setup_queues(struct nullb *nullb) ++{ ++ nullb->queues = kcalloc(nr_cpu_ids, sizeof(struct nullb_queue), ++ GFP_KERNEL); ++ if (!nullb->queues) ++ return -ENOMEM; ++ ++ nullb->queue_depth = nullb->dev->hw_queue_depth; ++ ++ return 0; ++} ++ ++static int init_driver_queues(struct nullb *nullb) ++{ ++ struct nullb_queue *nq; ++ int i, ret = 0; ++ ++ for (i = 0; i < nullb->dev->submit_queues; i++) { ++ nq = &nullb->queues[i]; ++ ++ null_init_queue(nullb, nq); ++ ++ ret = setup_commands(nq); ++ if (ret) ++ return ret; ++ nullb->nr_queues++; ++ } ++ return 0; ++} ++ ++static int null_gendisk_register(struct nullb *nullb) ++{ ++ sector_t size = ((sector_t)nullb->dev->size * SZ_1M) >> SECTOR_SHIFT; ++ struct gendisk *disk; ++ ++ disk = nullb->disk = alloc_disk_node(1, nullb->dev->home_node); ++ if (!disk) ++ return -ENOMEM; ++ set_capacity(disk, size); ++ ++ disk->flags |= GENHD_FL_EXT_DEVT | GENHD_FL_SUPPRESS_PARTITION_INFO; ++ disk->major = null_major; ++ disk->first_minor = nullb->index; ++ if (queue_is_mq(nullb->q)) ++ disk->fops = &null_rq_ops; ++ else ++ disk->fops = &null_bio_ops; ++ disk->private_data = nullb; ++ disk->queue = nullb->q; ++ strncpy(disk->disk_name, nullb->disk_name, DISK_NAME_LEN); ++ ++ if (nullb->dev->zoned) { ++ int ret = null_register_zoned_dev(nullb); ++ ++ if (ret) ++ return ret; ++ } ++ ++ add_disk(disk); ++ return 0; ++} ++ ++static int null_init_tag_set(struct nullb *nullb, struct blk_mq_tag_set *set) ++{ ++ set->ops = &null_mq_ops; ++ set->nr_hw_queues = nullb ? nullb->dev->submit_queues : ++ g_submit_queues; ++ set->queue_depth = nullb ? nullb->dev->hw_queue_depth : ++ g_hw_queue_depth; ++ set->numa_node = nullb ? 
nullb->dev->home_node : g_home_node; ++ set->cmd_size = sizeof(struct nullb_cmd); ++ set->flags = BLK_MQ_F_SHOULD_MERGE; ++ if (g_no_sched) ++ set->flags |= BLK_MQ_F_NO_SCHED; ++ if (g_shared_tag_bitmap) ++ set->flags |= BLK_MQ_F_TAG_HCTX_SHARED; ++ set->driver_data = NULL; ++ ++ if ((nullb && nullb->dev->blocking) || g_blocking) ++ set->flags |= BLK_MQ_F_BLOCKING; ++ ++ return blk_mq_alloc_tag_set(set); ++} ++ ++static int null_validate_conf(struct nullb_device *dev) ++{ ++ dev->blocksize = round_down(dev->blocksize, 512); ++ dev->blocksize = clamp_t(unsigned int, dev->blocksize, 512, 4096); ++ ++ if (dev->queue_mode == NULL_Q_MQ && dev->use_per_node_hctx) { ++ if (dev->submit_queues != nr_online_nodes) ++ dev->submit_queues = nr_online_nodes; ++ } else if (dev->submit_queues > nr_cpu_ids) ++ dev->submit_queues = nr_cpu_ids; ++ else if (dev->submit_queues == 0) ++ dev->submit_queues = 1; ++ ++ dev->queue_mode = min_t(unsigned int, dev->queue_mode, NULL_Q_MQ); ++ dev->irqmode = min_t(unsigned int, dev->irqmode, NULL_IRQ_TIMER); ++ ++ /* Do memory allocation, so set blocking */ ++ if (dev->memory_backed) ++ dev->blocking = true; ++ else /* cache is meaningless */ ++ dev->cache_size = 0; ++ dev->cache_size = min_t(unsigned long, ULONG_MAX / 1024 / 1024, ++ dev->cache_size); ++ dev->mbps = min_t(unsigned int, 1024 * 40, dev->mbps); ++ /* can not stop a queue */ ++ if (dev->queue_mode == NULL_Q_BIO) ++ dev->mbps = 0; ++ ++ if (dev->zoned && ++ (!dev->zone_size || !is_power_of_2(dev->zone_size))) { ++ pr_err("zone_size must be power-of-two\n"); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION ++static bool __null_setup_fault(struct fault_attr *attr, char *str) ++{ ++ if (!str[0]) ++ return true; ++ ++ if (!setup_fault_attr(attr, str)) ++ return false; ++ ++ attr->verbose = 0; ++ return true; ++} ++#endif ++ ++static bool null_setup_fault(void) ++{ ++#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION ++ if (!__null_setup_fault(&null_timeout_attr, g_timeout_str)) ++ return false; ++ if (!__null_setup_fault(&null_requeue_attr, g_requeue_str)) ++ return false; ++ if (!__null_setup_fault(&null_init_hctx_attr, g_init_hctx_str)) ++ return false; ++#endif ++ return true; ++} ++ ++static int null_add_dev(struct nullb_device *dev) ++{ ++ struct nullb *nullb; ++ int rv; ++ ++ rv = null_validate_conf(dev); ++ if (rv) ++ return rv; ++ ++ nullb = kzalloc_node(sizeof(*nullb), GFP_KERNEL, dev->home_node); ++ if (!nullb) { ++ rv = -ENOMEM; ++ goto out; ++ } ++ nullb->dev = dev; ++ dev->nullb = nullb; ++ ++ spin_lock_init(&nullb->lock); ++ ++ rv = setup_queues(nullb); ++ if (rv) ++ goto out_free_nullb; ++ ++ if (dev->queue_mode == NULL_Q_MQ) { ++ if (shared_tags) { ++ nullb->tag_set = &tag_set; ++ rv = 0; ++ } else { ++ nullb->tag_set = &nullb->__tag_set; ++ rv = null_init_tag_set(nullb, nullb->tag_set); ++ } ++ ++ if (rv) ++ goto out_cleanup_queues; ++ ++ if (!null_setup_fault()) ++ goto out_cleanup_queues; ++ ++ nullb->tag_set->timeout = 5 * HZ; ++ nullb->q = blk_mq_init_queue_data(nullb->tag_set, nullb); ++ if (IS_ERR(nullb->q)) { ++ rv = -ENOMEM; ++ goto out_cleanup_tags; ++ } ++ } else if (dev->queue_mode == NULL_Q_BIO) { ++ nullb->q = blk_alloc_queue(dev->home_node); ++ if (!nullb->q) { ++ rv = -ENOMEM; ++ goto out_cleanup_queues; ++ } ++ rv = init_driver_queues(nullb); ++ if (rv) ++ goto out_cleanup_blk_queue; ++ } ++ ++ if (dev->mbps) { ++ set_bit(NULLB_DEV_FL_THROTTLED, &dev->flags); ++ nullb_setup_bwtimer(nullb); ++ } ++ ++ if (dev->cache_size > 0) { ++ 
set_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags); ++ blk_queue_write_cache(nullb->q, true, true); ++ } ++ ++ if (dev->zoned) { ++ rv = null_init_zoned_dev(dev, nullb->q); ++ if (rv) ++ goto out_cleanup_blk_queue; ++ } ++ ++ nullb->q->queuedata = nullb; ++ blk_queue_flag_set(QUEUE_FLAG_NONROT, nullb->q); ++ blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, nullb->q); ++ ++ mutex_lock(&lock); ++ rv = ida_simple_get(&nullb_indexes, 0, 0, GFP_KERNEL); ++ if (rv < 0) { ++ mutex_unlock(&lock); ++ goto out_cleanup_zone; ++ } ++ nullb->index = rv; ++ dev->index = rv; ++ mutex_unlock(&lock); ++ ++ blk_queue_logical_block_size(nullb->q, dev->blocksize); ++ blk_queue_physical_block_size(nullb->q, dev->blocksize); ++ ++ null_config_discard(nullb); ++ ++ sprintf(nullb->disk_name, "nullb%d", nullb->index); ++ ++ rv = null_gendisk_register(nullb); ++ if (rv) ++ goto out_ida_free; ++ ++ mutex_lock(&lock); ++ list_add_tail(&nullb->list, &nullb_list); ++ mutex_unlock(&lock); ++ ++ return 0; ++ ++out_ida_free: ++ ida_free(&nullb_indexes, nullb->index); ++out_cleanup_zone: ++ null_free_zoned_dev(dev); ++out_cleanup_blk_queue: ++ blk_cleanup_queue(nullb->q); ++out_cleanup_tags: ++ if (dev->queue_mode == NULL_Q_MQ && nullb->tag_set == &nullb->__tag_set) ++ blk_mq_free_tag_set(nullb->tag_set); ++out_cleanup_queues: ++ cleanup_queues(nullb); ++out_free_nullb: ++ kfree(nullb); ++ dev->nullb = NULL; ++out: ++ return rv; ++} ++ ++static int __init null_init(void) ++{ ++ int ret = 0; ++ unsigned int i; ++ struct nullb *nullb; ++ struct nullb_device *dev; ++ ++ if (g_bs > PAGE_SIZE) { ++ pr_warn("invalid block size\n"); ++ pr_warn("defaults block size to %lu\n", PAGE_SIZE); ++ g_bs = PAGE_SIZE; ++ } ++ ++ if (g_home_node != NUMA_NO_NODE && g_home_node >= nr_online_nodes) { ++ pr_err("invalid home_node value\n"); ++ g_home_node = NUMA_NO_NODE; ++ } ++ ++ if (g_queue_mode == NULL_Q_RQ) { ++ pr_err("legacy IO path no longer available\n"); ++ return -EINVAL; ++ } ++ if (g_queue_mode == NULL_Q_MQ && g_use_per_node_hctx) { ++ if (g_submit_queues != nr_online_nodes) { ++ pr_warn("submit_queues param is set to %u.\n", ++ nr_online_nodes); ++ g_submit_queues = nr_online_nodes; ++ } ++ } else if (g_submit_queues > nr_cpu_ids) ++ g_submit_queues = nr_cpu_ids; ++ else if (g_submit_queues <= 0) ++ g_submit_queues = 1; ++ ++ if (g_queue_mode == NULL_Q_MQ && shared_tags) { ++ ret = null_init_tag_set(NULL, &tag_set); ++ if (ret) ++ return ret; ++ } ++ ++ config_group_init(&nullb_subsys.su_group); ++ mutex_init(&nullb_subsys.su_mutex); ++ ++ ret = configfs_register_subsystem(&nullb_subsys); ++ if (ret) ++ goto err_tagset; ++ ++ mutex_init(&lock); ++ ++ null_major = register_blkdev(0, "nullb"); ++ if (null_major < 0) { ++ ret = null_major; ++ goto err_conf; ++ } ++ ++ for (i = 0; i < nr_devices; i++) { ++ dev = null_alloc_dev(); ++ if (!dev) { ++ ret = -ENOMEM; ++ goto err_dev; ++ } ++ ret = null_add_dev(dev); ++ if (ret) { ++ null_free_dev(dev); ++ goto err_dev; ++ } ++ } ++ ++ pr_info("module loaded\n"); ++ return 0; ++ ++err_dev: ++ while (!list_empty(&nullb_list)) { ++ nullb = list_entry(nullb_list.next, struct nullb, list); ++ dev = nullb->dev; ++ null_del_dev(nullb); ++ null_free_dev(dev); ++ } ++ unregister_blkdev(null_major, "nullb"); ++err_conf: ++ configfs_unregister_subsystem(&nullb_subsys); ++err_tagset: ++ if (g_queue_mode == NULL_Q_MQ && shared_tags) ++ blk_mq_free_tag_set(&tag_set); ++ return ret; ++} ++ ++static void __exit null_exit(void) ++{ ++ struct nullb *nullb; ++ ++ configfs_unregister_subsystem(&nullb_subsys); ++ ++ 
unregister_blkdev(null_major, "nullb"); ++ ++ mutex_lock(&lock); ++ while (!list_empty(&nullb_list)) { ++ struct nullb_device *dev; ++ ++ nullb = list_entry(nullb_list.next, struct nullb, list); ++ dev = nullb->dev; ++ null_del_dev(nullb); ++ null_free_dev(dev); ++ } ++ mutex_unlock(&lock); ++ ++ if (g_queue_mode == NULL_Q_MQ && shared_tags) ++ blk_mq_free_tag_set(&tag_set); ++} ++ ++module_init(null_init); ++module_exit(null_exit); ++ ++MODULE_AUTHOR("Jens Axboe <axboe@kernel.dk>"); ++MODULE_LICENSE("GPL"); +diff --git a/drivers/block/null_blk/null_blk.h b/drivers/block/null_blk/null_blk.h +new file mode 100644 +index 0000000000000..7de703f28617b +--- /dev/null ++++ b/drivers/block/null_blk/null_blk.h +@@ -0,0 +1,137 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef __BLK_NULL_BLK_H ++#define __BLK_NULL_BLK_H ++ ++#undef pr_fmt ++#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt ++ ++#include <linux/blkdev.h> ++#include <linux/slab.h> ++#include <linux/blk-mq.h> ++#include <linux/hrtimer.h> ++#include <linux/configfs.h> ++#include <linux/badblocks.h> ++#include <linux/fault-inject.h> ++ ++struct nullb_cmd { ++ struct request *rq; ++ struct bio *bio; ++ unsigned int tag; ++ blk_status_t error; ++ struct nullb_queue *nq; ++ struct hrtimer timer; ++ bool fake_timeout; ++}; ++ ++struct nullb_queue { ++ unsigned long *tag_map; ++ wait_queue_head_t wait; ++ unsigned int queue_depth; ++ struct nullb_device *dev; ++ unsigned int requeue_selection; ++ ++ struct nullb_cmd *cmds; ++}; ++ ++struct nullb_device { ++ struct nullb *nullb; ++ struct config_item item; ++ struct radix_tree_root data; /* data stored in the disk */ ++ struct radix_tree_root cache; /* disk cache data */ ++ unsigned long flags; /* device flags */ ++ unsigned int curr_cache; ++ struct badblocks badblocks; ++ ++ unsigned int nr_zones; ++ unsigned int nr_zones_imp_open; ++ unsigned int nr_zones_exp_open; ++ unsigned int nr_zones_closed; ++ struct blk_zone *zones; ++ sector_t zone_size_sects; ++ spinlock_t zone_lock; ++ unsigned long *zone_locks; ++ ++ unsigned long size; /* device size in MB */ ++ unsigned long completion_nsec; /* time in ns to complete a request */ ++ unsigned long cache_size; /* disk cache size in MB */ ++ unsigned long zone_size; /* zone size in MB if device is zoned */ ++ unsigned long zone_capacity; /* zone capacity in MB if device is zoned */ ++ unsigned int zone_nr_conv; /* number of conventional zones */ ++ unsigned int zone_max_open; /* max number of open zones */ ++ unsigned int zone_max_active; /* max number of active zones */ ++ unsigned int submit_queues; /* number of submission queues */ ++ unsigned int home_node; /* home node for the device */ ++ unsigned int queue_mode; /* block interface */ ++ unsigned int blocksize; /* block size */ ++ unsigned int irqmode; /* IRQ completion handler */ ++ unsigned int hw_queue_depth; /* queue depth */ ++ unsigned int index; /* index of the disk, only valid with a disk */ ++ unsigned int mbps; /* Bandwidth throttle cap (in MB/s) */ ++ bool blocking; /* blocking blk-mq device */ ++ bool use_per_node_hctx; /* use per-node allocation for hardware context */ ++ bool power; /* power on/off the device */ ++ bool memory_backed; /* if data is stored in memory */ ++ bool discard; /* if support discard */ ++ bool zoned; /* if device is zoned */ ++}; ++ ++struct nullb { ++ struct nullb_device *dev; ++ struct list_head list; ++ unsigned int index; ++ struct request_queue *q; ++ struct gendisk *disk; ++ struct blk_mq_tag_set *tag_set; ++ struct blk_mq_tag_set __tag_set; ++ 
unsigned int queue_depth; ++ atomic_long_t cur_bytes; ++ struct hrtimer bw_timer; ++ unsigned long cache_flush_pos; ++ spinlock_t lock; ++ ++ struct nullb_queue *queues; ++ unsigned int nr_queues; ++ char disk_name[DISK_NAME_LEN]; ++}; ++ ++blk_status_t null_process_cmd(struct nullb_cmd *cmd, ++ enum req_opf op, sector_t sector, ++ unsigned int nr_sectors); ++ ++#ifdef CONFIG_BLK_DEV_ZONED ++int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q); ++int null_register_zoned_dev(struct nullb *nullb); ++void null_free_zoned_dev(struct nullb_device *dev); ++int null_report_zones(struct gendisk *disk, sector_t sector, ++ unsigned int nr_zones, report_zones_cb cb, void *data); ++blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, ++ enum req_opf op, sector_t sector, ++ sector_t nr_sectors); ++size_t null_zone_valid_read_len(struct nullb *nullb, ++ sector_t sector, unsigned int len); ++#else ++static inline int null_init_zoned_dev(struct nullb_device *dev, ++ struct request_queue *q) ++{ ++ pr_err("CONFIG_BLK_DEV_ZONED not enabled\n"); ++ return -EINVAL; ++} ++static inline int null_register_zoned_dev(struct nullb *nullb) ++{ ++ return -ENODEV; ++} ++static inline void null_free_zoned_dev(struct nullb_device *dev) {} ++static inline blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, ++ enum req_opf op, sector_t sector, sector_t nr_sectors) ++{ ++ return BLK_STS_NOTSUPP; ++} ++static inline size_t null_zone_valid_read_len(struct nullb *nullb, ++ sector_t sector, ++ unsigned int len) ++{ ++ return len; ++} ++#define null_report_zones NULL ++#endif /* CONFIG_BLK_DEV_ZONED */ ++#endif /* __NULL_BLK_H */ +diff --git a/drivers/block/null_blk/trace.c b/drivers/block/null_blk/trace.c +new file mode 100644 +index 0000000000000..3711cba160715 +--- /dev/null ++++ b/drivers/block/null_blk/trace.c +@@ -0,0 +1,21 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * null_blk trace related helpers. ++ * ++ * Copyright (C) 2020 Western Digital Corporation or its affiliates. ++ */ ++#include "trace.h" ++ ++/* ++ * Helper to use for all null_blk traces to extract disk name. ++ */ ++const char *nullb_trace_disk_name(struct trace_seq *p, char *name) ++{ ++ const char *ret = trace_seq_buffer_ptr(p); ++ ++ if (name && *name) ++ trace_seq_printf(p, "disk=%s, ", name); ++ trace_seq_putc(p, 0); ++ ++ return ret; ++} +diff --git a/drivers/block/null_blk/trace.h b/drivers/block/null_blk/trace.h +new file mode 100644 +index 0000000000000..ce3b430e88c57 +--- /dev/null ++++ b/drivers/block/null_blk/trace.h +@@ -0,0 +1,79 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * null_blk device driver tracepoints. ++ * ++ * Copyright (C) 2020 Western Digital Corporation or its affiliates. 
++ */ ++ ++#undef TRACE_SYSTEM ++#define TRACE_SYSTEM nullb ++ ++#if !defined(_TRACE_NULLB_H) || defined(TRACE_HEADER_MULTI_READ) ++#define _TRACE_NULLB_H ++ ++#include <linux/tracepoint.h> ++#include <linux/trace_seq.h> ++ ++#include "null_blk.h" ++ ++const char *nullb_trace_disk_name(struct trace_seq *p, char *name); ++ ++#define __print_disk_name(name) nullb_trace_disk_name(p, name) ++ ++#ifndef TRACE_HEADER_MULTI_READ ++static inline void __assign_disk_name(char *name, struct gendisk *disk) ++{ ++ if (disk) ++ memcpy(name, disk->disk_name, DISK_NAME_LEN); ++ else ++ memset(name, 0, DISK_NAME_LEN); ++} ++#endif ++ ++TRACE_EVENT(nullb_zone_op, ++ TP_PROTO(struct nullb_cmd *cmd, unsigned int zone_no, ++ unsigned int zone_cond), ++ TP_ARGS(cmd, zone_no, zone_cond), ++ TP_STRUCT__entry( ++ __array(char, disk, DISK_NAME_LEN) ++ __field(enum req_opf, op) ++ __field(unsigned int, zone_no) ++ __field(unsigned int, zone_cond) ++ ), ++ TP_fast_assign( ++ __entry->op = req_op(cmd->rq); ++ __entry->zone_no = zone_no; ++ __entry->zone_cond = zone_cond; ++ __assign_disk_name(__entry->disk, cmd->rq->rq_disk); ++ ), ++ TP_printk("%s req=%-15s zone_no=%u zone_cond=%-10s", ++ __print_disk_name(__entry->disk), ++ blk_op_str(__entry->op), ++ __entry->zone_no, ++ blk_zone_cond_str(__entry->zone_cond)) ++); ++ ++TRACE_EVENT(nullb_report_zones, ++ TP_PROTO(struct nullb *nullb, unsigned int nr_zones), ++ TP_ARGS(nullb, nr_zones), ++ TP_STRUCT__entry( ++ __array(char, disk, DISK_NAME_LEN) ++ __field(unsigned int, nr_zones) ++ ), ++ TP_fast_assign( ++ __entry->nr_zones = nr_zones; ++ __assign_disk_name(__entry->disk, nullb->disk); ++ ), ++ TP_printk("%s nr_zones=%u", ++ __print_disk_name(__entry->disk), __entry->nr_zones) ++); ++ ++#endif /* _TRACE_NULLB_H */ ++ ++#undef TRACE_INCLUDE_PATH ++#define TRACE_INCLUDE_PATH . 
++#undef TRACE_INCLUDE_FILE ++#define TRACE_INCLUDE_FILE trace ++ ++/* This part must be outside protection */ ++#include <trace/define_trace.h> +diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c +new file mode 100644 +index 0000000000000..41220ce59659b +--- /dev/null ++++ b/drivers/block/null_blk/zoned.c +@@ -0,0 +1,617 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include <linux/vmalloc.h> ++#include <linux/bitmap.h> ++#include "null_blk.h" ++ ++#define CREATE_TRACE_POINTS ++#include "trace.h" ++ ++#define MB_TO_SECTS(mb) (((sector_t)mb * SZ_1M) >> SECTOR_SHIFT) ++ ++static inline unsigned int null_zone_no(struct nullb_device *dev, sector_t sect) ++{ ++ return sect >> ilog2(dev->zone_size_sects); ++} ++ ++int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q) ++{ ++ sector_t dev_capacity_sects, zone_capacity_sects; ++ sector_t sector = 0; ++ unsigned int i; ++ ++ if (!is_power_of_2(dev->zone_size)) { ++ pr_err("zone_size must be power-of-two\n"); ++ return -EINVAL; ++ } ++ if (dev->zone_size > dev->size) { ++ pr_err("Zone size larger than device capacity\n"); ++ return -EINVAL; ++ } ++ ++ if (!dev->zone_capacity) ++ dev->zone_capacity = dev->zone_size; ++ ++ if (dev->zone_capacity > dev->zone_size) { ++ pr_err("null_blk: zone capacity (%lu MB) larger than zone size (%lu MB)\n", ++ dev->zone_capacity, dev->zone_size); ++ return -EINVAL; ++ } ++ ++ zone_capacity_sects = MB_TO_SECTS(dev->zone_capacity); ++ dev_capacity_sects = MB_TO_SECTS(dev->size); ++ dev->zone_size_sects = MB_TO_SECTS(dev->zone_size); ++ dev->nr_zones = dev_capacity_sects >> ilog2(dev->zone_size_sects); ++ if (dev_capacity_sects & (dev->zone_size_sects - 1)) ++ dev->nr_zones++; ++ ++ dev->zones = kvmalloc_array(dev->nr_zones, sizeof(struct blk_zone), ++ GFP_KERNEL | __GFP_ZERO); ++ if (!dev->zones) ++ return -ENOMEM; ++ ++ /* ++ * With memory backing, the zone_lock spinlock needs to be temporarily ++ * released to avoid scheduling in atomic context. To guarantee zone ++ * information protection, use a bitmap to lock zones with ++ * wait_on_bit_lock_io(). Sleeping on the lock is OK as memory backing ++ * implies that the queue is marked with BLK_MQ_F_BLOCKING. 
++ */ ++ spin_lock_init(&dev->zone_lock); ++ if (dev->memory_backed) { ++ dev->zone_locks = bitmap_zalloc(dev->nr_zones, GFP_KERNEL); ++ if (!dev->zone_locks) { ++ kvfree(dev->zones); ++ return -ENOMEM; ++ } ++ } ++ ++ if (dev->zone_nr_conv >= dev->nr_zones) { ++ dev->zone_nr_conv = dev->nr_zones - 1; ++ pr_info("changed the number of conventional zones to %u", ++ dev->zone_nr_conv); ++ } ++ ++ /* Max active zones has to be < nbr of seq zones in order to be enforceable */ ++ if (dev->zone_max_active >= dev->nr_zones - dev->zone_nr_conv) { ++ dev->zone_max_active = 0; ++ pr_info("zone_max_active limit disabled, limit >= zone count\n"); ++ } ++ ++ /* Max open zones has to be <= max active zones */ ++ if (dev->zone_max_active && dev->zone_max_open > dev->zone_max_active) { ++ dev->zone_max_open = dev->zone_max_active; ++ pr_info("changed the maximum number of open zones to %u\n", ++ dev->nr_zones); ++ } else if (dev->zone_max_open >= dev->nr_zones - dev->zone_nr_conv) { ++ dev->zone_max_open = 0; ++ pr_info("zone_max_open limit disabled, limit >= zone count\n"); ++ } ++ ++ for (i = 0; i < dev->zone_nr_conv; i++) { ++ struct blk_zone *zone = &dev->zones[i]; ++ ++ zone->start = sector; ++ zone->len = dev->zone_size_sects; ++ zone->capacity = zone->len; ++ zone->wp = zone->start + zone->len; ++ zone->type = BLK_ZONE_TYPE_CONVENTIONAL; ++ zone->cond = BLK_ZONE_COND_NOT_WP; ++ ++ sector += dev->zone_size_sects; ++ } ++ ++ for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) { ++ struct blk_zone *zone = &dev->zones[i]; ++ ++ zone->start = zone->wp = sector; ++ if (zone->start + dev->zone_size_sects > dev_capacity_sects) ++ zone->len = dev_capacity_sects - zone->start; ++ else ++ zone->len = dev->zone_size_sects; ++ zone->capacity = ++ min_t(sector_t, zone->len, zone_capacity_sects); ++ zone->type = BLK_ZONE_TYPE_SEQWRITE_REQ; ++ zone->cond = BLK_ZONE_COND_EMPTY; ++ ++ sector += dev->zone_size_sects; ++ } ++ ++ q->limits.zoned = BLK_ZONED_HM; ++ blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q); ++ blk_queue_required_elevator_features(q, ELEVATOR_F_ZBD_SEQ_WRITE); ++ ++ return 0; ++} ++ ++int null_register_zoned_dev(struct nullb *nullb) ++{ ++ struct nullb_device *dev = nullb->dev; ++ struct request_queue *q = nullb->q; ++ ++ if (queue_is_mq(q)) { ++ int ret = blk_revalidate_disk_zones(nullb->disk, NULL); ++ ++ if (ret) ++ return ret; ++ } else { ++ blk_queue_chunk_sectors(q, dev->zone_size_sects); ++ q->nr_zones = blkdev_nr_zones(nullb->disk); ++ } ++ ++ blk_queue_max_zone_append_sectors(q, dev->zone_size_sects); ++ blk_queue_max_open_zones(q, dev->zone_max_open); ++ blk_queue_max_active_zones(q, dev->zone_max_active); ++ ++ return 0; ++} ++ ++void null_free_zoned_dev(struct nullb_device *dev) ++{ ++ bitmap_free(dev->zone_locks); ++ kvfree(dev->zones); ++ dev->zones = NULL; ++} ++ ++static inline void null_lock_zone(struct nullb_device *dev, unsigned int zno) ++{ ++ if (dev->memory_backed) ++ wait_on_bit_lock_io(dev->zone_locks, zno, TASK_UNINTERRUPTIBLE); ++ spin_lock_irq(&dev->zone_lock); ++} ++ ++static inline void null_unlock_zone(struct nullb_device *dev, unsigned int zno) ++{ ++ spin_unlock_irq(&dev->zone_lock); ++ ++ if (dev->memory_backed) ++ clear_and_wake_up_bit(zno, dev->zone_locks); ++} ++ ++int null_report_zones(struct gendisk *disk, sector_t sector, ++ unsigned int nr_zones, report_zones_cb cb, void *data) ++{ ++ struct nullb *nullb = disk->private_data; ++ struct nullb_device *dev = nullb->dev; ++ unsigned int first_zone, i, zno; ++ struct blk_zone zone; ++ int error; ++ ++ first_zone = 
null_zone_no(dev, sector); ++ if (first_zone >= dev->nr_zones) ++ return 0; ++ ++ nr_zones = min(nr_zones, dev->nr_zones - first_zone); ++ trace_nullb_report_zones(nullb, nr_zones); ++ ++ zno = first_zone; ++ for (i = 0; i < nr_zones; i++, zno++) { ++ /* ++ * Stacked DM target drivers will remap the zone information by ++ * modifying the zone information passed to the report callback. ++ * So use a local copy to avoid corruption of the device zone ++ * array. ++ */ ++ null_lock_zone(dev, zno); ++ memcpy(&zone, &dev->zones[zno], sizeof(struct blk_zone)); ++ null_unlock_zone(dev, zno); ++ ++ error = cb(&zone, i, data); ++ if (error) ++ return error; ++ } ++ ++ return nr_zones; ++} ++ ++/* ++ * This is called in the case of memory backing from null_process_cmd() ++ * with the target zone already locked. ++ */ ++size_t null_zone_valid_read_len(struct nullb *nullb, ++ sector_t sector, unsigned int len) ++{ ++ struct nullb_device *dev = nullb->dev; ++ struct blk_zone *zone = &dev->zones[null_zone_no(dev, sector)]; ++ unsigned int nr_sectors = len >> SECTOR_SHIFT; ++ ++ /* Read must be below the write pointer position */ ++ if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL || ++ sector + nr_sectors <= zone->wp) ++ return len; ++ ++ if (sector > zone->wp) ++ return 0; ++ ++ return (zone->wp - sector) << SECTOR_SHIFT; ++} ++ ++static blk_status_t null_close_zone(struct nullb_device *dev, struct blk_zone *zone) ++{ ++ if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) ++ return BLK_STS_IOERR; ++ ++ switch (zone->cond) { ++ case BLK_ZONE_COND_CLOSED: ++ /* close operation on closed is not an error */ ++ return BLK_STS_OK; ++ case BLK_ZONE_COND_IMP_OPEN: ++ dev->nr_zones_imp_open--; ++ break; ++ case BLK_ZONE_COND_EXP_OPEN: ++ dev->nr_zones_exp_open--; ++ break; ++ case BLK_ZONE_COND_EMPTY: ++ case BLK_ZONE_COND_FULL: ++ default: ++ return BLK_STS_IOERR; ++ } ++ ++ if (zone->wp == zone->start) { ++ zone->cond = BLK_ZONE_COND_EMPTY; ++ } else { ++ zone->cond = BLK_ZONE_COND_CLOSED; ++ dev->nr_zones_closed++; ++ } ++ ++ return BLK_STS_OK; ++} ++ ++static void null_close_first_imp_zone(struct nullb_device *dev) ++{ ++ unsigned int i; ++ ++ for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) { ++ if (dev->zones[i].cond == BLK_ZONE_COND_IMP_OPEN) { ++ null_close_zone(dev, &dev->zones[i]); ++ return; ++ } ++ } ++} ++ ++static blk_status_t null_check_active(struct nullb_device *dev) ++{ ++ if (!dev->zone_max_active) ++ return BLK_STS_OK; ++ ++ if (dev->nr_zones_exp_open + dev->nr_zones_imp_open + ++ dev->nr_zones_closed < dev->zone_max_active) ++ return BLK_STS_OK; ++ ++ return BLK_STS_ZONE_ACTIVE_RESOURCE; ++} ++ ++static blk_status_t null_check_open(struct nullb_device *dev) ++{ ++ if (!dev->zone_max_open) ++ return BLK_STS_OK; ++ ++ if (dev->nr_zones_exp_open + dev->nr_zones_imp_open < dev->zone_max_open) ++ return BLK_STS_OK; ++ ++ if (dev->nr_zones_imp_open) { ++ if (null_check_active(dev) == BLK_STS_OK) { ++ null_close_first_imp_zone(dev); ++ return BLK_STS_OK; ++ } ++ } ++ ++ return BLK_STS_ZONE_OPEN_RESOURCE; ++} ++ ++/* ++ * This function matches the manage open zone resources function in the ZBC standard, ++ * with the addition of max active zones support (added in the ZNS standard). ++ * ++ * The function determines if a zone can transition to implicit open or explicit open, ++ * while maintaining the max open zone (and max active zone) limit(s). It may close an ++ * implicit open zone in order to make additional zone resources available. 
++ * ++ * ZBC states that an implicit open zone shall be closed only if there is not ++ * room within the open limit. However, with the addition of an active limit, ++ * it is not certain that closing an implicit open zone will allow a new zone ++ * to be opened, since we might already be at the active limit capacity. ++ */ ++static blk_status_t null_check_zone_resources(struct nullb_device *dev, struct blk_zone *zone) ++{ ++ blk_status_t ret; ++ ++ switch (zone->cond) { ++ case BLK_ZONE_COND_EMPTY: ++ ret = null_check_active(dev); ++ if (ret != BLK_STS_OK) ++ return ret; ++ fallthrough; ++ case BLK_ZONE_COND_CLOSED: ++ return null_check_open(dev); ++ default: ++ /* Should never be called for other states */ ++ WARN_ON(1); ++ return BLK_STS_IOERR; ++ } ++} ++ ++static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector, ++ unsigned int nr_sectors, bool append) ++{ ++ struct nullb_device *dev = cmd->nq->dev; ++ unsigned int zno = null_zone_no(dev, sector); ++ struct blk_zone *zone = &dev->zones[zno]; ++ blk_status_t ret; ++ ++ trace_nullb_zone_op(cmd, zno, zone->cond); ++ ++ if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) { ++ if (append) ++ return BLK_STS_IOERR; ++ return null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors); ++ } ++ ++ null_lock_zone(dev, zno); ++ ++ switch (zone->cond) { ++ case BLK_ZONE_COND_FULL: ++ /* Cannot write to a full zone */ ++ ret = BLK_STS_IOERR; ++ goto unlock; ++ case BLK_ZONE_COND_EMPTY: ++ case BLK_ZONE_COND_CLOSED: ++ ret = null_check_zone_resources(dev, zone); ++ if (ret != BLK_STS_OK) ++ goto unlock; ++ break; ++ case BLK_ZONE_COND_IMP_OPEN: ++ case BLK_ZONE_COND_EXP_OPEN: ++ break; ++ default: ++ /* Invalid zone condition */ ++ ret = BLK_STS_IOERR; ++ goto unlock; ++ } ++ ++ /* ++ * Regular writes must be at the write pointer position. ++ * Zone append writes are automatically issued at the write ++ * pointer and the position returned using the request or BIO ++ * sector. ++ */ ++ if (append) { ++ sector = zone->wp; ++ if (cmd->bio) ++ cmd->bio->bi_iter.bi_sector = sector; ++ else ++ cmd->rq->__sector = sector; ++ } else if (sector != zone->wp) { ++ ret = BLK_STS_IOERR; ++ goto unlock; ++ } ++ ++ if (zone->wp + nr_sectors > zone->start + zone->capacity) { ++ ret = BLK_STS_IOERR; ++ goto unlock; ++ } ++ ++ if (zone->cond == BLK_ZONE_COND_CLOSED) { ++ dev->nr_zones_closed--; ++ dev->nr_zones_imp_open++; ++ } else if (zone->cond == BLK_ZONE_COND_EMPTY) { ++ dev->nr_zones_imp_open++; ++ } ++ if (zone->cond != BLK_ZONE_COND_EXP_OPEN) ++ zone->cond = BLK_ZONE_COND_IMP_OPEN; ++ ++ /* ++ * Memory backing allocation may sleep: release the zone_lock spinlock ++ * to avoid scheduling in atomic context. Zone operation atomicity is ++ * still guaranteed through the zone_locks bitmap. 
++ */ ++ if (dev->memory_backed) ++ spin_unlock_irq(&dev->zone_lock); ++ ret = null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors); ++ if (dev->memory_backed) ++ spin_lock_irq(&dev->zone_lock); ++ ++ if (ret != BLK_STS_OK) ++ goto unlock; ++ ++ zone->wp += nr_sectors; ++ if (zone->wp == zone->start + zone->capacity) { ++ if (zone->cond == BLK_ZONE_COND_EXP_OPEN) ++ dev->nr_zones_exp_open--; ++ else if (zone->cond == BLK_ZONE_COND_IMP_OPEN) ++ dev->nr_zones_imp_open--; ++ zone->cond = BLK_ZONE_COND_FULL; ++ } ++ ret = BLK_STS_OK; ++ ++unlock: ++ null_unlock_zone(dev, zno); ++ ++ return ret; ++} ++ ++static blk_status_t null_open_zone(struct nullb_device *dev, struct blk_zone *zone) ++{ ++ blk_status_t ret; ++ ++ if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) ++ return BLK_STS_IOERR; ++ ++ switch (zone->cond) { ++ case BLK_ZONE_COND_EXP_OPEN: ++ /* open operation on exp open is not an error */ ++ return BLK_STS_OK; ++ case BLK_ZONE_COND_EMPTY: ++ ret = null_check_zone_resources(dev, zone); ++ if (ret != BLK_STS_OK) ++ return ret; ++ break; ++ case BLK_ZONE_COND_IMP_OPEN: ++ dev->nr_zones_imp_open--; ++ break; ++ case BLK_ZONE_COND_CLOSED: ++ ret = null_check_zone_resources(dev, zone); ++ if (ret != BLK_STS_OK) ++ return ret; ++ dev->nr_zones_closed--; ++ break; ++ case BLK_ZONE_COND_FULL: ++ default: ++ return BLK_STS_IOERR; ++ } ++ ++ zone->cond = BLK_ZONE_COND_EXP_OPEN; ++ dev->nr_zones_exp_open++; ++ ++ return BLK_STS_OK; ++} ++ ++static blk_status_t null_finish_zone(struct nullb_device *dev, struct blk_zone *zone) ++{ ++ blk_status_t ret; ++ ++ if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) ++ return BLK_STS_IOERR; ++ ++ switch (zone->cond) { ++ case BLK_ZONE_COND_FULL: ++ /* finish operation on full is not an error */ ++ return BLK_STS_OK; ++ case BLK_ZONE_COND_EMPTY: ++ ret = null_check_zone_resources(dev, zone); ++ if (ret != BLK_STS_OK) ++ return ret; ++ break; ++ case BLK_ZONE_COND_IMP_OPEN: ++ dev->nr_zones_imp_open--; ++ break; ++ case BLK_ZONE_COND_EXP_OPEN: ++ dev->nr_zones_exp_open--; ++ break; ++ case BLK_ZONE_COND_CLOSED: ++ ret = null_check_zone_resources(dev, zone); ++ if (ret != BLK_STS_OK) ++ return ret; ++ dev->nr_zones_closed--; ++ break; ++ default: ++ return BLK_STS_IOERR; ++ } ++ ++ zone->cond = BLK_ZONE_COND_FULL; ++ zone->wp = zone->start + zone->len; ++ ++ return BLK_STS_OK; ++} ++ ++static blk_status_t null_reset_zone(struct nullb_device *dev, struct blk_zone *zone) ++{ ++ if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) ++ return BLK_STS_IOERR; ++ ++ switch (zone->cond) { ++ case BLK_ZONE_COND_EMPTY: ++ /* reset operation on empty is not an error */ ++ return BLK_STS_OK; ++ case BLK_ZONE_COND_IMP_OPEN: ++ dev->nr_zones_imp_open--; ++ break; ++ case BLK_ZONE_COND_EXP_OPEN: ++ dev->nr_zones_exp_open--; ++ break; ++ case BLK_ZONE_COND_CLOSED: ++ dev->nr_zones_closed--; ++ break; ++ case BLK_ZONE_COND_FULL: ++ break; ++ default: ++ return BLK_STS_IOERR; ++ } ++ ++ zone->cond = BLK_ZONE_COND_EMPTY; ++ zone->wp = zone->start; ++ ++ return BLK_STS_OK; ++} ++ ++static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_opf op, ++ sector_t sector) ++{ ++ struct nullb_device *dev = cmd->nq->dev; ++ unsigned int zone_no; ++ struct blk_zone *zone; ++ blk_status_t ret; ++ size_t i; ++ ++ if (op == REQ_OP_ZONE_RESET_ALL) { ++ for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) { ++ null_lock_zone(dev, i); ++ zone = &dev->zones[i]; ++ if (zone->cond != BLK_ZONE_COND_EMPTY) { ++ null_reset_zone(dev, zone); ++ trace_nullb_zone_op(cmd, i, zone->cond); ++ } ++ 
null_unlock_zone(dev, i); ++ } ++ return BLK_STS_OK; ++ } ++ ++ zone_no = null_zone_no(dev, sector); ++ zone = &dev->zones[zone_no]; ++ ++ null_lock_zone(dev, zone_no); ++ ++ switch (op) { ++ case REQ_OP_ZONE_RESET: ++ ret = null_reset_zone(dev, zone); ++ break; ++ case REQ_OP_ZONE_OPEN: ++ ret = null_open_zone(dev, zone); ++ break; ++ case REQ_OP_ZONE_CLOSE: ++ ret = null_close_zone(dev, zone); ++ break; ++ case REQ_OP_ZONE_FINISH: ++ ret = null_finish_zone(dev, zone); ++ break; ++ default: ++ ret = BLK_STS_NOTSUPP; ++ break; ++ } ++ ++ if (ret == BLK_STS_OK) ++ trace_nullb_zone_op(cmd, zone_no, zone->cond); ++ ++ null_unlock_zone(dev, zone_no); ++ ++ return ret; ++} ++ ++blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, enum req_opf op, ++ sector_t sector, sector_t nr_sectors) ++{ ++ struct nullb_device *dev = cmd->nq->dev; ++ unsigned int zno = null_zone_no(dev, sector); ++ blk_status_t sts; ++ ++ switch (op) { ++ case REQ_OP_WRITE: ++ sts = null_zone_write(cmd, sector, nr_sectors, false); ++ break; ++ case REQ_OP_ZONE_APPEND: ++ sts = null_zone_write(cmd, sector, nr_sectors, true); ++ break; ++ case REQ_OP_ZONE_RESET: ++ case REQ_OP_ZONE_RESET_ALL: ++ case REQ_OP_ZONE_OPEN: ++ case REQ_OP_ZONE_CLOSE: ++ case REQ_OP_ZONE_FINISH: ++ sts = null_zone_mgmt(cmd, op, sector); ++ break; ++ default: ++ null_lock_zone(dev, zno); ++ sts = null_process_cmd(cmd, op, sector, nr_sectors); ++ null_unlock_zone(dev, zno); ++ } ++ ++ return sts; ++} +diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c +deleted file mode 100644 +index c6ba8f9f3f311..0000000000000 +--- a/drivers/block/null_blk_main.c ++++ /dev/null +@@ -1,2036 +0,0 @@ +-// SPDX-License-Identifier: GPL-2.0-only +-/* +- * Add configfs and memory store: Kyungchan Koh <kkc6196@fb.com> and +- * Shaohua Li <shli@fb.com> +- */ +-#include <linux/module.h> +- +-#include <linux/moduleparam.h> +-#include <linux/sched.h> +-#include <linux/fs.h> +-#include <linux/init.h> +-#include "null_blk.h" +- +-#define PAGE_SECTORS_SHIFT (PAGE_SHIFT - SECTOR_SHIFT) +-#define PAGE_SECTORS (1 << PAGE_SECTORS_SHIFT) +-#define SECTOR_MASK (PAGE_SECTORS - 1) +- +-#define FREE_BATCH 16 +- +-#define TICKS_PER_SEC 50ULL +-#define TIMER_INTERVAL (NSEC_PER_SEC / TICKS_PER_SEC) +- +-#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION +-static DECLARE_FAULT_ATTR(null_timeout_attr); +-static DECLARE_FAULT_ATTR(null_requeue_attr); +-static DECLARE_FAULT_ATTR(null_init_hctx_attr); +-#endif +- +-static inline u64 mb_per_tick(int mbps) +-{ +- return (1 << 20) / TICKS_PER_SEC * ((u64) mbps); +-} +- +-/* +- * Status flags for nullb_device. +- * +- * CONFIGURED: Device has been configured and turned on. Cannot reconfigure. +- * UP: Device is currently on and visible in userspace. +- * THROTTLED: Device is being throttled. +- * CACHE: Device is using a write-back cache. +- */ +-enum nullb_device_flags { +- NULLB_DEV_FL_CONFIGURED = 0, +- NULLB_DEV_FL_UP = 1, +- NULLB_DEV_FL_THROTTLED = 2, +- NULLB_DEV_FL_CACHE = 3, +-}; +- +-#define MAP_SZ ((PAGE_SIZE >> SECTOR_SHIFT) + 2) +-/* +- * nullb_page is a page in memory for nullb devices. +- * +- * @page: The page holding the data. +- * @bitmap: The bitmap represents which sector in the page has data. +- * Each bit represents one block size. For example, sector 8 +- * will use the 7th bit +- * The highest 2 bits of bitmap are for special purpose. LOCK means the cache +- * page is being flushing to storage. FREE means the cache page is freed and +- * should be skipped from flushing to storage. 
Please see +- * null_make_cache_space +- */ +-struct nullb_page { +- struct page *page; +- DECLARE_BITMAP(bitmap, MAP_SZ); +-}; +-#define NULLB_PAGE_LOCK (MAP_SZ - 1) +-#define NULLB_PAGE_FREE (MAP_SZ - 2) +- +-static LIST_HEAD(nullb_list); +-static struct mutex lock; +-static int null_major; +-static DEFINE_IDA(nullb_indexes); +-static struct blk_mq_tag_set tag_set; +- +-enum { +- NULL_IRQ_NONE = 0, +- NULL_IRQ_SOFTIRQ = 1, +- NULL_IRQ_TIMER = 2, +-}; +- +-enum { +- NULL_Q_BIO = 0, +- NULL_Q_RQ = 1, +- NULL_Q_MQ = 2, +-}; +- +-static int g_no_sched; +-module_param_named(no_sched, g_no_sched, int, 0444); +-MODULE_PARM_DESC(no_sched, "No io scheduler"); +- +-static int g_submit_queues = 1; +-module_param_named(submit_queues, g_submit_queues, int, 0444); +-MODULE_PARM_DESC(submit_queues, "Number of submission queues"); +- +-static int g_home_node = NUMA_NO_NODE; +-module_param_named(home_node, g_home_node, int, 0444); +-MODULE_PARM_DESC(home_node, "Home node for the device"); +- +-#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION +-/* +- * For more details about fault injection, please refer to +- * Documentation/fault-injection/fault-injection.rst. +- */ +-static char g_timeout_str[80]; +-module_param_string(timeout, g_timeout_str, sizeof(g_timeout_str), 0444); +-MODULE_PARM_DESC(timeout, "Fault injection. timeout=<interval>,<probability>,<space>,<times>"); +- +-static char g_requeue_str[80]; +-module_param_string(requeue, g_requeue_str, sizeof(g_requeue_str), 0444); +-MODULE_PARM_DESC(requeue, "Fault injection. requeue=<interval>,<probability>,<space>,<times>"); +- +-static char g_init_hctx_str[80]; +-module_param_string(init_hctx, g_init_hctx_str, sizeof(g_init_hctx_str), 0444); +-MODULE_PARM_DESC(init_hctx, "Fault injection to fail hctx init. init_hctx=<interval>,<probability>,<space>,<times>"); +-#endif +- +-static int g_queue_mode = NULL_Q_MQ; +- +-static int null_param_store_val(const char *str, int *val, int min, int max) +-{ +- int ret, new_val; +- +- ret = kstrtoint(str, 10, &new_val); +- if (ret) +- return -EINVAL; +- +- if (new_val < min || new_val > max) +- return -EINVAL; +- +- *val = new_val; +- return 0; +-} +- +-static int null_set_queue_mode(const char *str, const struct kernel_param *kp) +-{ +- return null_param_store_val(str, &g_queue_mode, NULL_Q_BIO, NULL_Q_MQ); +-} +- +-static const struct kernel_param_ops null_queue_mode_param_ops = { +- .set = null_set_queue_mode, +- .get = param_get_int, +-}; +- +-device_param_cb(queue_mode, &null_queue_mode_param_ops, &g_queue_mode, 0444); +-MODULE_PARM_DESC(queue_mode, "Block interface to use (0=bio,1=rq,2=multiqueue)"); +- +-static int g_gb = 250; +-module_param_named(gb, g_gb, int, 0444); +-MODULE_PARM_DESC(gb, "Size in GB"); +- +-static int g_bs = 512; +-module_param_named(bs, g_bs, int, 0444); +-MODULE_PARM_DESC(bs, "Block size (in bytes)"); +- +-static unsigned int nr_devices = 1; +-module_param(nr_devices, uint, 0444); +-MODULE_PARM_DESC(nr_devices, "Number of devices to register"); +- +-static bool g_blocking; +-module_param_named(blocking, g_blocking, bool, 0444); +-MODULE_PARM_DESC(blocking, "Register as a blocking blk-mq driver device"); +- +-static bool shared_tags; +-module_param(shared_tags, bool, 0444); +-MODULE_PARM_DESC(shared_tags, "Share tag set between devices for blk-mq"); +- +-static bool g_shared_tag_bitmap; +-module_param_named(shared_tag_bitmap, g_shared_tag_bitmap, bool, 0444); +-MODULE_PARM_DESC(shared_tag_bitmap, "Use shared tag bitmap for all submission queues for blk-mq"); +- +-static int g_irqmode = 
NULL_IRQ_SOFTIRQ; +- +-static int null_set_irqmode(const char *str, const struct kernel_param *kp) +-{ +- return null_param_store_val(str, &g_irqmode, NULL_IRQ_NONE, +- NULL_IRQ_TIMER); +-} +- +-static const struct kernel_param_ops null_irqmode_param_ops = { +- .set = null_set_irqmode, +- .get = param_get_int, +-}; +- +-device_param_cb(irqmode, &null_irqmode_param_ops, &g_irqmode, 0444); +-MODULE_PARM_DESC(irqmode, "IRQ completion handler. 0-none, 1-softirq, 2-timer"); +- +-static unsigned long g_completion_nsec = 10000; +-module_param_named(completion_nsec, g_completion_nsec, ulong, 0444); +-MODULE_PARM_DESC(completion_nsec, "Time in ns to complete a request in hardware. Default: 10,000ns"); +- +-static int g_hw_queue_depth = 64; +-module_param_named(hw_queue_depth, g_hw_queue_depth, int, 0444); +-MODULE_PARM_DESC(hw_queue_depth, "Queue depth for each hardware queue. Default: 64"); +- +-static bool g_use_per_node_hctx; +-module_param_named(use_per_node_hctx, g_use_per_node_hctx, bool, 0444); +-MODULE_PARM_DESC(use_per_node_hctx, "Use per-node allocation for hardware context queues. Default: false"); +- +-static bool g_zoned; +-module_param_named(zoned, g_zoned, bool, S_IRUGO); +-MODULE_PARM_DESC(zoned, "Make device as a host-managed zoned block device. Default: false"); +- +-static unsigned long g_zone_size = 256; +-module_param_named(zone_size, g_zone_size, ulong, S_IRUGO); +-MODULE_PARM_DESC(zone_size, "Zone size in MB when block device is zoned. Must be power-of-two: Default: 256"); +- +-static unsigned long g_zone_capacity; +-module_param_named(zone_capacity, g_zone_capacity, ulong, 0444); +-MODULE_PARM_DESC(zone_capacity, "Zone capacity in MB when block device is zoned. Can be less than or equal to zone size. Default: Zone size"); +- +-static unsigned int g_zone_nr_conv; +-module_param_named(zone_nr_conv, g_zone_nr_conv, uint, 0444); +-MODULE_PARM_DESC(zone_nr_conv, "Number of conventional zones when block device is zoned. Default: 0"); +- +-static unsigned int g_zone_max_open; +-module_param_named(zone_max_open, g_zone_max_open, uint, 0444); +-MODULE_PARM_DESC(zone_max_open, "Maximum number of open zones when block device is zoned. Default: 0 (no limit)"); +- +-static unsigned int g_zone_max_active; +-module_param_named(zone_max_active, g_zone_max_active, uint, 0444); +-MODULE_PARM_DESC(zone_max_active, "Maximum number of active zones when block device is zoned. Default: 0 (no limit)"); +- +-static struct nullb_device *null_alloc_dev(void); +-static void null_free_dev(struct nullb_device *dev); +-static void null_del_dev(struct nullb *nullb); +-static int null_add_dev(struct nullb_device *dev); +-static void null_free_device_storage(struct nullb_device *dev, bool is_cache); +- +-static inline struct nullb_device *to_nullb_device(struct config_item *item) +-{ +- return item ? 
container_of(item, struct nullb_device, item) : NULL; +-} +- +-static inline ssize_t nullb_device_uint_attr_show(unsigned int val, char *page) +-{ +- return snprintf(page, PAGE_SIZE, "%u\n", val); +-} +- +-static inline ssize_t nullb_device_ulong_attr_show(unsigned long val, +- char *page) +-{ +- return snprintf(page, PAGE_SIZE, "%lu\n", val); +-} +- +-static inline ssize_t nullb_device_bool_attr_show(bool val, char *page) +-{ +- return snprintf(page, PAGE_SIZE, "%u\n", val); +-} +- +-static ssize_t nullb_device_uint_attr_store(unsigned int *val, +- const char *page, size_t count) +-{ +- unsigned int tmp; +- int result; +- +- result = kstrtouint(page, 0, &tmp); +- if (result < 0) +- return result; +- +- *val = tmp; +- return count; +-} +- +-static ssize_t nullb_device_ulong_attr_store(unsigned long *val, +- const char *page, size_t count) +-{ +- int result; +- unsigned long tmp; +- +- result = kstrtoul(page, 0, &tmp); +- if (result < 0) +- return result; +- +- *val = tmp; +- return count; +-} +- +-static ssize_t nullb_device_bool_attr_store(bool *val, const char *page, +- size_t count) +-{ +- bool tmp; +- int result; +- +- result = kstrtobool(page, &tmp); +- if (result < 0) +- return result; +- +- *val = tmp; +- return count; +-} +- +-/* The following macro should only be used with TYPE = {uint, ulong, bool}. */ +-#define NULLB_DEVICE_ATTR(NAME, TYPE, APPLY) \ +-static ssize_t \ +-nullb_device_##NAME##_show(struct config_item *item, char *page) \ +-{ \ +- return nullb_device_##TYPE##_attr_show( \ +- to_nullb_device(item)->NAME, page); \ +-} \ +-static ssize_t \ +-nullb_device_##NAME##_store(struct config_item *item, const char *page, \ +- size_t count) \ +-{ \ +- int (*apply_fn)(struct nullb_device *dev, TYPE new_value) = APPLY;\ +- struct nullb_device *dev = to_nullb_device(item); \ +- TYPE new_value = 0; \ +- int ret; \ +- \ +- ret = nullb_device_##TYPE##_attr_store(&new_value, page, count);\ +- if (ret < 0) \ +- return ret; \ +- if (apply_fn) \ +- ret = apply_fn(dev, new_value); \ +- else if (test_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags)) \ +- ret = -EBUSY; \ +- if (ret < 0) \ +- return ret; \ +- dev->NAME = new_value; \ +- return count; \ +-} \ +-CONFIGFS_ATTR(nullb_device_, NAME); +- +-static int nullb_apply_submit_queues(struct nullb_device *dev, +- unsigned int submit_queues) +-{ +- struct nullb *nullb = dev->nullb; +- struct blk_mq_tag_set *set; +- +- if (!nullb) +- return 0; +- +- /* +- * Make sure that null_init_hctx() does not access nullb->queues[] past +- * the end of that array. +- */ +- if (submit_queues > nr_cpu_ids) +- return -EINVAL; +- set = nullb->tag_set; +- blk_mq_update_nr_hw_queues(set, submit_queues); +- return set->nr_hw_queues == submit_queues ? 
0 : -ENOMEM; +-} +- +-NULLB_DEVICE_ATTR(size, ulong, NULL); +-NULLB_DEVICE_ATTR(completion_nsec, ulong, NULL); +-NULLB_DEVICE_ATTR(submit_queues, uint, nullb_apply_submit_queues); +-NULLB_DEVICE_ATTR(home_node, uint, NULL); +-NULLB_DEVICE_ATTR(queue_mode, uint, NULL); +-NULLB_DEVICE_ATTR(blocksize, uint, NULL); +-NULLB_DEVICE_ATTR(irqmode, uint, NULL); +-NULLB_DEVICE_ATTR(hw_queue_depth, uint, NULL); +-NULLB_DEVICE_ATTR(index, uint, NULL); +-NULLB_DEVICE_ATTR(blocking, bool, NULL); +-NULLB_DEVICE_ATTR(use_per_node_hctx, bool, NULL); +-NULLB_DEVICE_ATTR(memory_backed, bool, NULL); +-NULLB_DEVICE_ATTR(discard, bool, NULL); +-NULLB_DEVICE_ATTR(mbps, uint, NULL); +-NULLB_DEVICE_ATTR(cache_size, ulong, NULL); +-NULLB_DEVICE_ATTR(zoned, bool, NULL); +-NULLB_DEVICE_ATTR(zone_size, ulong, NULL); +-NULLB_DEVICE_ATTR(zone_capacity, ulong, NULL); +-NULLB_DEVICE_ATTR(zone_nr_conv, uint, NULL); +-NULLB_DEVICE_ATTR(zone_max_open, uint, NULL); +-NULLB_DEVICE_ATTR(zone_max_active, uint, NULL); +- +-static ssize_t nullb_device_power_show(struct config_item *item, char *page) +-{ +- return nullb_device_bool_attr_show(to_nullb_device(item)->power, page); +-} +- +-static ssize_t nullb_device_power_store(struct config_item *item, +- const char *page, size_t count) +-{ +- struct nullb_device *dev = to_nullb_device(item); +- bool newp = false; +- ssize_t ret; +- +- ret = nullb_device_bool_attr_store(&newp, page, count); +- if (ret < 0) +- return ret; +- +- if (!dev->power && newp) { +- if (test_and_set_bit(NULLB_DEV_FL_UP, &dev->flags)) +- return count; +- if (null_add_dev(dev)) { +- clear_bit(NULLB_DEV_FL_UP, &dev->flags); +- return -ENOMEM; +- } +- +- set_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags); +- dev->power = newp; +- } else if (dev->power && !newp) { +- if (test_and_clear_bit(NULLB_DEV_FL_UP, &dev->flags)) { +- mutex_lock(&lock); +- dev->power = newp; +- null_del_dev(dev->nullb); +- mutex_unlock(&lock); +- } +- clear_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags); +- } +- +- return count; +-} +- +-CONFIGFS_ATTR(nullb_device_, power); +- +-static ssize_t nullb_device_badblocks_show(struct config_item *item, char *page) +-{ +- struct nullb_device *t_dev = to_nullb_device(item); +- +- return badblocks_show(&t_dev->badblocks, page, 0); +-} +- +-static ssize_t nullb_device_badblocks_store(struct config_item *item, +- const char *page, size_t count) +-{ +- struct nullb_device *t_dev = to_nullb_device(item); +- char *orig, *buf, *tmp; +- u64 start, end; +- int ret; +- +- orig = kstrndup(page, count, GFP_KERNEL); +- if (!orig) +- return -ENOMEM; +- +- buf = strstrip(orig); +- +- ret = -EINVAL; +- if (buf[0] != '+' && buf[0] != '-') +- goto out; +- tmp = strchr(&buf[1], '-'); +- if (!tmp) +- goto out; +- *tmp = '\0'; +- ret = kstrtoull(buf + 1, 0, &start); +- if (ret) +- goto out; +- ret = kstrtoull(tmp + 1, 0, &end); +- if (ret) +- goto out; +- ret = -EINVAL; +- if (start > end) +- goto out; +- /* enable badblocks */ +- cmpxchg(&t_dev->badblocks.shift, -1, 0); +- if (buf[0] == '+') +- ret = badblocks_set(&t_dev->badblocks, start, +- end - start + 1, 1); +- else +- ret = badblocks_clear(&t_dev->badblocks, start, +- end - start + 1); +- if (ret == 0) +- ret = count; +-out: +- kfree(orig); +- return ret; +-} +-CONFIGFS_ATTR(nullb_device_, badblocks); +- +-static struct configfs_attribute *nullb_device_attrs[] = { +- &nullb_device_attr_size, +- &nullb_device_attr_completion_nsec, +- &nullb_device_attr_submit_queues, +- &nullb_device_attr_home_node, +- &nullb_device_attr_queue_mode, +- &nullb_device_attr_blocksize, +- 
&nullb_device_attr_irqmode, +- &nullb_device_attr_hw_queue_depth, +- &nullb_device_attr_index, +- &nullb_device_attr_blocking, +- &nullb_device_attr_use_per_node_hctx, +- &nullb_device_attr_power, +- &nullb_device_attr_memory_backed, +- &nullb_device_attr_discard, +- &nullb_device_attr_mbps, +- &nullb_device_attr_cache_size, +- &nullb_device_attr_badblocks, +- &nullb_device_attr_zoned, +- &nullb_device_attr_zone_size, +- &nullb_device_attr_zone_capacity, +- &nullb_device_attr_zone_nr_conv, +- &nullb_device_attr_zone_max_open, +- &nullb_device_attr_zone_max_active, +- NULL, +-}; +- +-static void nullb_device_release(struct config_item *item) +-{ +- struct nullb_device *dev = to_nullb_device(item); +- +- null_free_device_storage(dev, false); +- null_free_dev(dev); +-} +- +-static struct configfs_item_operations nullb_device_ops = { +- .release = nullb_device_release, +-}; +- +-static const struct config_item_type nullb_device_type = { +- .ct_item_ops = &nullb_device_ops, +- .ct_attrs = nullb_device_attrs, +- .ct_owner = THIS_MODULE, +-}; +- +-static struct +-config_item *nullb_group_make_item(struct config_group *group, const char *name) +-{ +- struct nullb_device *dev; +- +- dev = null_alloc_dev(); +- if (!dev) +- return ERR_PTR(-ENOMEM); +- +- config_item_init_type_name(&dev->item, name, &nullb_device_type); +- +- return &dev->item; +-} +- +-static void +-nullb_group_drop_item(struct config_group *group, struct config_item *item) +-{ +- struct nullb_device *dev = to_nullb_device(item); +- +- if (test_and_clear_bit(NULLB_DEV_FL_UP, &dev->flags)) { +- mutex_lock(&lock); +- dev->power = false; +- null_del_dev(dev->nullb); +- mutex_unlock(&lock); +- } +- +- config_item_put(item); +-} +- +-static ssize_t memb_group_features_show(struct config_item *item, char *page) +-{ +- return snprintf(page, PAGE_SIZE, +- "memory_backed,discard,bandwidth,cache,badblocks,zoned,zone_size,zone_capacity,zone_nr_conv,zone_max_open,zone_max_active\n"); +-} +- +-CONFIGFS_ATTR_RO(memb_group_, features); +- +-static struct configfs_attribute *nullb_group_attrs[] = { +- &memb_group_attr_features, +- NULL, +-}; +- +-static struct configfs_group_operations nullb_group_ops = { +- .make_item = nullb_group_make_item, +- .drop_item = nullb_group_drop_item, +-}; +- +-static const struct config_item_type nullb_group_type = { +- .ct_group_ops = &nullb_group_ops, +- .ct_attrs = nullb_group_attrs, +- .ct_owner = THIS_MODULE, +-}; +- +-static struct configfs_subsystem nullb_subsys = { +- .su_group = { +- .cg_item = { +- .ci_namebuf = "nullb", +- .ci_type = &nullb_group_type, +- }, +- }, +-}; +- +-static inline int null_cache_active(struct nullb *nullb) +-{ +- return test_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags); +-} +- +-static struct nullb_device *null_alloc_dev(void) +-{ +- struct nullb_device *dev; +- +- dev = kzalloc(sizeof(*dev), GFP_KERNEL); +- if (!dev) +- return NULL; +- INIT_RADIX_TREE(&dev->data, GFP_ATOMIC); +- INIT_RADIX_TREE(&dev->cache, GFP_ATOMIC); +- if (badblocks_init(&dev->badblocks, 0)) { +- kfree(dev); +- return NULL; +- } +- +- dev->size = g_gb * 1024; +- dev->completion_nsec = g_completion_nsec; +- dev->submit_queues = g_submit_queues; +- dev->home_node = g_home_node; +- dev->queue_mode = g_queue_mode; +- dev->blocksize = g_bs; +- dev->irqmode = g_irqmode; +- dev->hw_queue_depth = g_hw_queue_depth; +- dev->blocking = g_blocking; +- dev->use_per_node_hctx = g_use_per_node_hctx; +- dev->zoned = g_zoned; +- dev->zone_size = g_zone_size; +- dev->zone_capacity = g_zone_capacity; +- dev->zone_nr_conv = 
g_zone_nr_conv; +- dev->zone_max_open = g_zone_max_open; +- dev->zone_max_active = g_zone_max_active; +- return dev; +-} +- +-static void null_free_dev(struct nullb_device *dev) +-{ +- if (!dev) +- return; +- +- null_free_zoned_dev(dev); +- badblocks_exit(&dev->badblocks); +- kfree(dev); +-} +- +-static void put_tag(struct nullb_queue *nq, unsigned int tag) +-{ +- clear_bit_unlock(tag, nq->tag_map); +- +- if (waitqueue_active(&nq->wait)) +- wake_up(&nq->wait); +-} +- +-static unsigned int get_tag(struct nullb_queue *nq) +-{ +- unsigned int tag; +- +- do { +- tag = find_first_zero_bit(nq->tag_map, nq->queue_depth); +- if (tag >= nq->queue_depth) +- return -1U; +- } while (test_and_set_bit_lock(tag, nq->tag_map)); +- +- return tag; +-} +- +-static void free_cmd(struct nullb_cmd *cmd) +-{ +- put_tag(cmd->nq, cmd->tag); +-} +- +-static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer); +- +-static struct nullb_cmd *__alloc_cmd(struct nullb_queue *nq) +-{ +- struct nullb_cmd *cmd; +- unsigned int tag; +- +- tag = get_tag(nq); +- if (tag != -1U) { +- cmd = &nq->cmds[tag]; +- cmd->tag = tag; +- cmd->error = BLK_STS_OK; +- cmd->nq = nq; +- if (nq->dev->irqmode == NULL_IRQ_TIMER) { +- hrtimer_init(&cmd->timer, CLOCK_MONOTONIC, +- HRTIMER_MODE_REL); +- cmd->timer.function = null_cmd_timer_expired; +- } +- return cmd; +- } +- +- return NULL; +-} +- +-static struct nullb_cmd *alloc_cmd(struct nullb_queue *nq, int can_wait) +-{ +- struct nullb_cmd *cmd; +- DEFINE_WAIT(wait); +- +- cmd = __alloc_cmd(nq); +- if (cmd || !can_wait) +- return cmd; +- +- do { +- prepare_to_wait(&nq->wait, &wait, TASK_UNINTERRUPTIBLE); +- cmd = __alloc_cmd(nq); +- if (cmd) +- break; +- +- io_schedule(); +- } while (1); +- +- finish_wait(&nq->wait, &wait); +- return cmd; +-} +- +-static void end_cmd(struct nullb_cmd *cmd) +-{ +- int queue_mode = cmd->nq->dev->queue_mode; +- +- switch (queue_mode) { +- case NULL_Q_MQ: +- blk_mq_end_request(cmd->rq, cmd->error); +- return; +- case NULL_Q_BIO: +- cmd->bio->bi_status = cmd->error; +- bio_endio(cmd->bio); +- break; +- } +- +- free_cmd(cmd); +-} +- +-static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer) +-{ +- end_cmd(container_of(timer, struct nullb_cmd, timer)); +- +- return HRTIMER_NORESTART; +-} +- +-static void null_cmd_end_timer(struct nullb_cmd *cmd) +-{ +- ktime_t kt = cmd->nq->dev->completion_nsec; +- +- hrtimer_start(&cmd->timer, kt, HRTIMER_MODE_REL); +-} +- +-static void null_complete_rq(struct request *rq) +-{ +- end_cmd(blk_mq_rq_to_pdu(rq)); +-} +- +-static struct nullb_page *null_alloc_page(gfp_t gfp_flags) +-{ +- struct nullb_page *t_page; +- +- t_page = kmalloc(sizeof(struct nullb_page), gfp_flags); +- if (!t_page) +- goto out; +- +- t_page->page = alloc_pages(gfp_flags, 0); +- if (!t_page->page) +- goto out_freepage; +- +- memset(t_page->bitmap, 0, sizeof(t_page->bitmap)); +- return t_page; +-out_freepage: +- kfree(t_page); +-out: +- return NULL; +-} +- +-static void null_free_page(struct nullb_page *t_page) +-{ +- __set_bit(NULLB_PAGE_FREE, t_page->bitmap); +- if (test_bit(NULLB_PAGE_LOCK, t_page->bitmap)) +- return; +- __free_page(t_page->page); +- kfree(t_page); +-} +- +-static bool null_page_empty(struct nullb_page *page) +-{ +- int size = MAP_SZ - 2; +- +- return find_first_bit(page->bitmap, size) == size; +-} +- +-static void null_free_sector(struct nullb *nullb, sector_t sector, +- bool is_cache) +-{ +- unsigned int sector_bit; +- u64 idx; +- struct nullb_page *t_page, *ret; +- struct radix_tree_root *root; +- +- root = 
is_cache ? &nullb->dev->cache : &nullb->dev->data; +- idx = sector >> PAGE_SECTORS_SHIFT; +- sector_bit = (sector & SECTOR_MASK); +- +- t_page = radix_tree_lookup(root, idx); +- if (t_page) { +- __clear_bit(sector_bit, t_page->bitmap); +- +- if (null_page_empty(t_page)) { +- ret = radix_tree_delete_item(root, idx, t_page); +- WARN_ON(ret != t_page); +- null_free_page(ret); +- if (is_cache) +- nullb->dev->curr_cache -= PAGE_SIZE; +- } +- } +-} +- +-static struct nullb_page *null_radix_tree_insert(struct nullb *nullb, u64 idx, +- struct nullb_page *t_page, bool is_cache) +-{ +- struct radix_tree_root *root; +- +- root = is_cache ? &nullb->dev->cache : &nullb->dev->data; +- +- if (radix_tree_insert(root, idx, t_page)) { +- null_free_page(t_page); +- t_page = radix_tree_lookup(root, idx); +- WARN_ON(!t_page || t_page->page->index != idx); +- } else if (is_cache) +- nullb->dev->curr_cache += PAGE_SIZE; +- +- return t_page; +-} +- +-static void null_free_device_storage(struct nullb_device *dev, bool is_cache) +-{ +- unsigned long pos = 0; +- int nr_pages; +- struct nullb_page *ret, *t_pages[FREE_BATCH]; +- struct radix_tree_root *root; +- +- root = is_cache ? &dev->cache : &dev->data; +- +- do { +- int i; +- +- nr_pages = radix_tree_gang_lookup(root, +- (void **)t_pages, pos, FREE_BATCH); +- +- for (i = 0; i < nr_pages; i++) { +- pos = t_pages[i]->page->index; +- ret = radix_tree_delete_item(root, pos, t_pages[i]); +- WARN_ON(ret != t_pages[i]); +- null_free_page(ret); +- } +- +- pos++; +- } while (nr_pages == FREE_BATCH); +- +- if (is_cache) +- dev->curr_cache = 0; +-} +- +-static struct nullb_page *__null_lookup_page(struct nullb *nullb, +- sector_t sector, bool for_write, bool is_cache) +-{ +- unsigned int sector_bit; +- u64 idx; +- struct nullb_page *t_page; +- struct radix_tree_root *root; +- +- idx = sector >> PAGE_SECTORS_SHIFT; +- sector_bit = (sector & SECTOR_MASK); +- +- root = is_cache ? 
&nullb->dev->cache : &nullb->dev->data; +- t_page = radix_tree_lookup(root, idx); +- WARN_ON(t_page && t_page->page->index != idx); +- +- if (t_page && (for_write || test_bit(sector_bit, t_page->bitmap))) +- return t_page; +- +- return NULL; +-} +- +-static struct nullb_page *null_lookup_page(struct nullb *nullb, +- sector_t sector, bool for_write, bool ignore_cache) +-{ +- struct nullb_page *page = NULL; +- +- if (!ignore_cache) +- page = __null_lookup_page(nullb, sector, for_write, true); +- if (page) +- return page; +- return __null_lookup_page(nullb, sector, for_write, false); +-} +- +-static struct nullb_page *null_insert_page(struct nullb *nullb, +- sector_t sector, bool ignore_cache) +- __releases(&nullb->lock) +- __acquires(&nullb->lock) +-{ +- u64 idx; +- struct nullb_page *t_page; +- +- t_page = null_lookup_page(nullb, sector, true, ignore_cache); +- if (t_page) +- return t_page; +- +- spin_unlock_irq(&nullb->lock); +- +- t_page = null_alloc_page(GFP_NOIO); +- if (!t_page) +- goto out_lock; +- +- if (radix_tree_preload(GFP_NOIO)) +- goto out_freepage; +- +- spin_lock_irq(&nullb->lock); +- idx = sector >> PAGE_SECTORS_SHIFT; +- t_page->page->index = idx; +- t_page = null_radix_tree_insert(nullb, idx, t_page, !ignore_cache); +- radix_tree_preload_end(); +- +- return t_page; +-out_freepage: +- null_free_page(t_page); +-out_lock: +- spin_lock_irq(&nullb->lock); +- return null_lookup_page(nullb, sector, true, ignore_cache); +-} +- +-static int null_flush_cache_page(struct nullb *nullb, struct nullb_page *c_page) +-{ +- int i; +- unsigned int offset; +- u64 idx; +- struct nullb_page *t_page, *ret; +- void *dst, *src; +- +- idx = c_page->page->index; +- +- t_page = null_insert_page(nullb, idx << PAGE_SECTORS_SHIFT, true); +- +- __clear_bit(NULLB_PAGE_LOCK, c_page->bitmap); +- if (test_bit(NULLB_PAGE_FREE, c_page->bitmap)) { +- null_free_page(c_page); +- if (t_page && null_page_empty(t_page)) { +- ret = radix_tree_delete_item(&nullb->dev->data, +- idx, t_page); +- null_free_page(t_page); +- } +- return 0; +- } +- +- if (!t_page) +- return -ENOMEM; +- +- src = kmap_atomic(c_page->page); +- dst = kmap_atomic(t_page->page); +- +- for (i = 0; i < PAGE_SECTORS; +- i += (nullb->dev->blocksize >> SECTOR_SHIFT)) { +- if (test_bit(i, c_page->bitmap)) { +- offset = (i << SECTOR_SHIFT); +- memcpy(dst + offset, src + offset, +- nullb->dev->blocksize); +- __set_bit(i, t_page->bitmap); +- } +- } +- +- kunmap_atomic(dst); +- kunmap_atomic(src); +- +- ret = radix_tree_delete_item(&nullb->dev->cache, idx, c_page); +- null_free_page(ret); +- nullb->dev->curr_cache -= PAGE_SIZE; +- +- return 0; +-} +- +-static int null_make_cache_space(struct nullb *nullb, unsigned long n) +-{ +- int i, err, nr_pages; +- struct nullb_page *c_pages[FREE_BATCH]; +- unsigned long flushed = 0, one_round; +- +-again: +- if ((nullb->dev->cache_size * 1024 * 1024) > +- nullb->dev->curr_cache + n || nullb->dev->curr_cache == 0) +- return 0; +- +- nr_pages = radix_tree_gang_lookup(&nullb->dev->cache, +- (void **)c_pages, nullb->cache_flush_pos, FREE_BATCH); +- /* +- * nullb_flush_cache_page could unlock before using the c_pages. 
To +- * avoid race, we don't allow page free +- */ +- for (i = 0; i < nr_pages; i++) { +- nullb->cache_flush_pos = c_pages[i]->page->index; +- /* +- * We found the page which is being flushed to disk by other +- * threads +- */ +- if (test_bit(NULLB_PAGE_LOCK, c_pages[i]->bitmap)) +- c_pages[i] = NULL; +- else +- __set_bit(NULLB_PAGE_LOCK, c_pages[i]->bitmap); +- } +- +- one_round = 0; +- for (i = 0; i < nr_pages; i++) { +- if (c_pages[i] == NULL) +- continue; +- err = null_flush_cache_page(nullb, c_pages[i]); +- if (err) +- return err; +- one_round++; +- } +- flushed += one_round << PAGE_SHIFT; +- +- if (n > flushed) { +- if (nr_pages == 0) +- nullb->cache_flush_pos = 0; +- if (one_round == 0) { +- /* give other threads a chance */ +- spin_unlock_irq(&nullb->lock); +- spin_lock_irq(&nullb->lock); +- } +- goto again; +- } +- return 0; +-} +- +-static int copy_to_nullb(struct nullb *nullb, struct page *source, +- unsigned int off, sector_t sector, size_t n, bool is_fua) +-{ +- size_t temp, count = 0; +- unsigned int offset; +- struct nullb_page *t_page; +- void *dst, *src; +- +- while (count < n) { +- temp = min_t(size_t, nullb->dev->blocksize, n - count); +- +- if (null_cache_active(nullb) && !is_fua) +- null_make_cache_space(nullb, PAGE_SIZE); +- +- offset = (sector & SECTOR_MASK) << SECTOR_SHIFT; +- t_page = null_insert_page(nullb, sector, +- !null_cache_active(nullb) || is_fua); +- if (!t_page) +- return -ENOSPC; +- +- src = kmap_atomic(source); +- dst = kmap_atomic(t_page->page); +- memcpy(dst + offset, src + off + count, temp); +- kunmap_atomic(dst); +- kunmap_atomic(src); +- +- __set_bit(sector & SECTOR_MASK, t_page->bitmap); +- +- if (is_fua) +- null_free_sector(nullb, sector, true); +- +- count += temp; +- sector += temp >> SECTOR_SHIFT; +- } +- return 0; +-} +- +-static int copy_from_nullb(struct nullb *nullb, struct page *dest, +- unsigned int off, sector_t sector, size_t n) +-{ +- size_t temp, count = 0; +- unsigned int offset; +- struct nullb_page *t_page; +- void *dst, *src; +- +- while (count < n) { +- temp = min_t(size_t, nullb->dev->blocksize, n - count); +- +- offset = (sector & SECTOR_MASK) << SECTOR_SHIFT; +- t_page = null_lookup_page(nullb, sector, false, +- !null_cache_active(nullb)); +- +- dst = kmap_atomic(dest); +- if (!t_page) { +- memset(dst + off + count, 0, temp); +- goto next; +- } +- src = kmap_atomic(t_page->page); +- memcpy(dst + off + count, src + offset, temp); +- kunmap_atomic(src); +-next: +- kunmap_atomic(dst); +- +- count += temp; +- sector += temp >> SECTOR_SHIFT; +- } +- return 0; +-} +- +-static void nullb_fill_pattern(struct nullb *nullb, struct page *page, +- unsigned int len, unsigned int off) +-{ +- void *dst; +- +- dst = kmap_atomic(page); +- memset(dst + off, 0xFF, len); +- kunmap_atomic(dst); +-} +- +-static void null_handle_discard(struct nullb *nullb, sector_t sector, size_t n) +-{ +- size_t temp; +- +- spin_lock_irq(&nullb->lock); +- while (n > 0) { +- temp = min_t(size_t, n, nullb->dev->blocksize); +- null_free_sector(nullb, sector, false); +- if (null_cache_active(nullb)) +- null_free_sector(nullb, sector, true); +- sector += temp >> SECTOR_SHIFT; +- n -= temp; +- } +- spin_unlock_irq(&nullb->lock); +-} +- +-static int null_handle_flush(struct nullb *nullb) +-{ +- int err; +- +- if (!null_cache_active(nullb)) +- return 0; +- +- spin_lock_irq(&nullb->lock); +- while (true) { +- err = null_make_cache_space(nullb, +- nullb->dev->cache_size * 1024 * 1024); +- if (err || nullb->dev->curr_cache == 0) +- break; +- } +- +- 
WARN_ON(!radix_tree_empty(&nullb->dev->cache)); +- spin_unlock_irq(&nullb->lock); +- return err; +-} +- +-static int null_transfer(struct nullb *nullb, struct page *page, +- unsigned int len, unsigned int off, bool is_write, sector_t sector, +- bool is_fua) +-{ +- struct nullb_device *dev = nullb->dev; +- unsigned int valid_len = len; +- int err = 0; +- +- if (!is_write) { +- if (dev->zoned) +- valid_len = null_zone_valid_read_len(nullb, +- sector, len); +- +- if (valid_len) { +- err = copy_from_nullb(nullb, page, off, +- sector, valid_len); +- off += valid_len; +- len -= valid_len; +- } +- +- if (len) +- nullb_fill_pattern(nullb, page, len, off); +- flush_dcache_page(page); +- } else { +- flush_dcache_page(page); +- err = copy_to_nullb(nullb, page, off, sector, len, is_fua); +- } +- +- return err; +-} +- +-static int null_handle_rq(struct nullb_cmd *cmd) +-{ +- struct request *rq = cmd->rq; +- struct nullb *nullb = cmd->nq->dev->nullb; +- int err; +- unsigned int len; +- sector_t sector; +- struct req_iterator iter; +- struct bio_vec bvec; +- +- sector = blk_rq_pos(rq); +- +- if (req_op(rq) == REQ_OP_DISCARD) { +- null_handle_discard(nullb, sector, blk_rq_bytes(rq)); +- return 0; +- } +- +- spin_lock_irq(&nullb->lock); +- rq_for_each_segment(bvec, rq, iter) { +- len = bvec.bv_len; +- err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset, +- op_is_write(req_op(rq)), sector, +- rq->cmd_flags & REQ_FUA); +- if (err) { +- spin_unlock_irq(&nullb->lock); +- return err; +- } +- sector += len >> SECTOR_SHIFT; +- } +- spin_unlock_irq(&nullb->lock); +- +- return 0; +-} +- +-static int null_handle_bio(struct nullb_cmd *cmd) +-{ +- struct bio *bio = cmd->bio; +- struct nullb *nullb = cmd->nq->dev->nullb; +- int err; +- unsigned int len; +- sector_t sector; +- struct bio_vec bvec; +- struct bvec_iter iter; +- +- sector = bio->bi_iter.bi_sector; +- +- if (bio_op(bio) == REQ_OP_DISCARD) { +- null_handle_discard(nullb, sector, +- bio_sectors(bio) << SECTOR_SHIFT); +- return 0; +- } +- +- spin_lock_irq(&nullb->lock); +- bio_for_each_segment(bvec, bio, iter) { +- len = bvec.bv_len; +- err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset, +- op_is_write(bio_op(bio)), sector, +- bio->bi_opf & REQ_FUA); +- if (err) { +- spin_unlock_irq(&nullb->lock); +- return err; +- } +- sector += len >> SECTOR_SHIFT; +- } +- spin_unlock_irq(&nullb->lock); +- return 0; +-} +- +-static void null_stop_queue(struct nullb *nullb) +-{ +- struct request_queue *q = nullb->q; +- +- if (nullb->dev->queue_mode == NULL_Q_MQ) +- blk_mq_stop_hw_queues(q); +-} +- +-static void null_restart_queue_async(struct nullb *nullb) +-{ +- struct request_queue *q = nullb->q; +- +- if (nullb->dev->queue_mode == NULL_Q_MQ) +- blk_mq_start_stopped_hw_queues(q, true); +-} +- +-static inline blk_status_t null_handle_throttled(struct nullb_cmd *cmd) +-{ +- struct nullb_device *dev = cmd->nq->dev; +- struct nullb *nullb = dev->nullb; +- blk_status_t sts = BLK_STS_OK; +- struct request *rq = cmd->rq; +- +- if (!hrtimer_active(&nullb->bw_timer)) +- hrtimer_restart(&nullb->bw_timer); +- +- if (atomic_long_sub_return(blk_rq_bytes(rq), &nullb->cur_bytes) < 0) { +- null_stop_queue(nullb); +- /* race with timer */ +- if (atomic_long_read(&nullb->cur_bytes) > 0) +- null_restart_queue_async(nullb); +- /* requeue request */ +- sts = BLK_STS_DEV_RESOURCE; +- } +- return sts; +-} +- +-static inline blk_status_t null_handle_badblocks(struct nullb_cmd *cmd, +- sector_t sector, +- sector_t nr_sectors) +-{ +- struct badblocks *bb = 
&cmd->nq->dev->badblocks; +- sector_t first_bad; +- int bad_sectors; +- +- if (badblocks_check(bb, sector, nr_sectors, &first_bad, &bad_sectors)) +- return BLK_STS_IOERR; +- +- return BLK_STS_OK; +-} +- +-static inline blk_status_t null_handle_memory_backed(struct nullb_cmd *cmd, +- enum req_opf op) +-{ +- struct nullb_device *dev = cmd->nq->dev; +- int err; +- +- if (dev->queue_mode == NULL_Q_BIO) +- err = null_handle_bio(cmd); +- else +- err = null_handle_rq(cmd); +- +- return errno_to_blk_status(err); +-} +- +-static void nullb_zero_read_cmd_buffer(struct nullb_cmd *cmd) +-{ +- struct nullb_device *dev = cmd->nq->dev; +- struct bio *bio; +- +- if (dev->memory_backed) +- return; +- +- if (dev->queue_mode == NULL_Q_BIO && bio_op(cmd->bio) == REQ_OP_READ) { +- zero_fill_bio(cmd->bio); +- } else if (req_op(cmd->rq) == REQ_OP_READ) { +- __rq_for_each_bio(bio, cmd->rq) +- zero_fill_bio(bio); +- } +-} +- +-static inline void nullb_complete_cmd(struct nullb_cmd *cmd) +-{ +- /* +- * Since root privileges are required to configure the null_blk +- * driver, it is fine that this driver does not initialize the +- * data buffers of read commands. Zero-initialize these buffers +- * anyway if KMSAN is enabled to prevent that KMSAN complains +- * about null_blk not initializing read data buffers. +- */ +- if (IS_ENABLED(CONFIG_KMSAN)) +- nullb_zero_read_cmd_buffer(cmd); +- +- /* Complete IO by inline, softirq or timer */ +- switch (cmd->nq->dev->irqmode) { +- case NULL_IRQ_SOFTIRQ: +- switch (cmd->nq->dev->queue_mode) { +- case NULL_Q_MQ: +- if (likely(!blk_should_fake_timeout(cmd->rq->q))) +- blk_mq_complete_request(cmd->rq); +- break; +- case NULL_Q_BIO: +- /* +- * XXX: no proper submitting cpu information available. +- */ +- end_cmd(cmd); +- break; +- } +- break; +- case NULL_IRQ_NONE: +- end_cmd(cmd); +- break; +- case NULL_IRQ_TIMER: +- null_cmd_end_timer(cmd); +- break; +- } +-} +- +-blk_status_t null_process_cmd(struct nullb_cmd *cmd, +- enum req_opf op, sector_t sector, +- unsigned int nr_sectors) +-{ +- struct nullb_device *dev = cmd->nq->dev; +- blk_status_t ret; +- +- if (dev->badblocks.shift != -1) { +- ret = null_handle_badblocks(cmd, sector, nr_sectors); +- if (ret != BLK_STS_OK) +- return ret; +- } +- +- if (dev->memory_backed) +- return null_handle_memory_backed(cmd, op); +- +- return BLK_STS_OK; +-} +- +-static blk_status_t null_handle_cmd(struct nullb_cmd *cmd, sector_t sector, +- sector_t nr_sectors, enum req_opf op) +-{ +- struct nullb_device *dev = cmd->nq->dev; +- struct nullb *nullb = dev->nullb; +- blk_status_t sts; +- +- if (test_bit(NULLB_DEV_FL_THROTTLED, &dev->flags)) { +- sts = null_handle_throttled(cmd); +- if (sts != BLK_STS_OK) +- return sts; +- } +- +- if (op == REQ_OP_FLUSH) { +- cmd->error = errno_to_blk_status(null_handle_flush(nullb)); +- goto out; +- } +- +- if (dev->zoned) +- sts = null_process_zoned_cmd(cmd, op, sector, nr_sectors); +- else +- sts = null_process_cmd(cmd, op, sector, nr_sectors); +- +- /* Do not overwrite errors (e.g. 
timeout errors) */ +- if (cmd->error == BLK_STS_OK) +- cmd->error = sts; +- +-out: +- nullb_complete_cmd(cmd); +- return BLK_STS_OK; +-} +- +-static enum hrtimer_restart nullb_bwtimer_fn(struct hrtimer *timer) +-{ +- struct nullb *nullb = container_of(timer, struct nullb, bw_timer); +- ktime_t timer_interval = ktime_set(0, TIMER_INTERVAL); +- unsigned int mbps = nullb->dev->mbps; +- +- if (atomic_long_read(&nullb->cur_bytes) == mb_per_tick(mbps)) +- return HRTIMER_NORESTART; +- +- atomic_long_set(&nullb->cur_bytes, mb_per_tick(mbps)); +- null_restart_queue_async(nullb); +- +- hrtimer_forward_now(&nullb->bw_timer, timer_interval); +- +- return HRTIMER_RESTART; +-} +- +-static void nullb_setup_bwtimer(struct nullb *nullb) +-{ +- ktime_t timer_interval = ktime_set(0, TIMER_INTERVAL); +- +- hrtimer_init(&nullb->bw_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); +- nullb->bw_timer.function = nullb_bwtimer_fn; +- atomic_long_set(&nullb->cur_bytes, mb_per_tick(nullb->dev->mbps)); +- hrtimer_start(&nullb->bw_timer, timer_interval, HRTIMER_MODE_REL); +-} +- +-static struct nullb_queue *nullb_to_queue(struct nullb *nullb) +-{ +- int index = 0; +- +- if (nullb->nr_queues != 1) +- index = raw_smp_processor_id() / ((nr_cpu_ids + nullb->nr_queues - 1) / nullb->nr_queues); +- +- return &nullb->queues[index]; +-} +- +-static blk_qc_t null_submit_bio(struct bio *bio) +-{ +- sector_t sector = bio->bi_iter.bi_sector; +- sector_t nr_sectors = bio_sectors(bio); +- struct nullb *nullb = bio->bi_disk->private_data; +- struct nullb_queue *nq = nullb_to_queue(nullb); +- struct nullb_cmd *cmd; +- +- cmd = alloc_cmd(nq, 1); +- cmd->bio = bio; +- +- null_handle_cmd(cmd, sector, nr_sectors, bio_op(bio)); +- return BLK_QC_T_NONE; +-} +- +-static bool should_timeout_request(struct request *rq) +-{ +-#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION +- if (g_timeout_str[0]) +- return should_fail(&null_timeout_attr, 1); +-#endif +- return false; +-} +- +-static bool should_requeue_request(struct request *rq) +-{ +-#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION +- if (g_requeue_str[0]) +- return should_fail(&null_requeue_attr, 1); +-#endif +- return false; +-} +- +-static enum blk_eh_timer_return null_timeout_rq(struct request *rq, bool res) +-{ +- struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq); +- +- pr_info("rq %p timed out\n", rq); +- +- /* +- * If the device is marked as blocking (i.e. memory backed or zoned +- * device), the submission path may be blocked waiting for resources +- * and cause real timeouts. For these real timeouts, the submission +- * path will complete the request using blk_mq_complete_request(). +- * Only fake timeouts need to execute blk_mq_complete_request() here. 
+- */ +- cmd->error = BLK_STS_TIMEOUT; +- if (cmd->fake_timeout) +- blk_mq_complete_request(rq); +- return BLK_EH_DONE; +-} +- +-static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx, +- const struct blk_mq_queue_data *bd) +-{ +- struct nullb_cmd *cmd = blk_mq_rq_to_pdu(bd->rq); +- struct nullb_queue *nq = hctx->driver_data; +- sector_t nr_sectors = blk_rq_sectors(bd->rq); +- sector_t sector = blk_rq_pos(bd->rq); +- +- might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING); +- +- if (nq->dev->irqmode == NULL_IRQ_TIMER) { +- hrtimer_init(&cmd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); +- cmd->timer.function = null_cmd_timer_expired; +- } +- cmd->rq = bd->rq; +- cmd->error = BLK_STS_OK; +- cmd->nq = nq; +- cmd->fake_timeout = should_timeout_request(bd->rq); +- +- blk_mq_start_request(bd->rq); +- +- if (should_requeue_request(bd->rq)) { +- /* +- * Alternate between hitting the core BUSY path, and the +- * driver driven requeue path +- */ +- nq->requeue_selection++; +- if (nq->requeue_selection & 1) +- return BLK_STS_RESOURCE; +- else { +- blk_mq_requeue_request(bd->rq, true); +- return BLK_STS_OK; +- } +- } +- if (cmd->fake_timeout) +- return BLK_STS_OK; +- +- return null_handle_cmd(cmd, sector, nr_sectors, req_op(bd->rq)); +-} +- +-static void cleanup_queue(struct nullb_queue *nq) +-{ +- kfree(nq->tag_map); +- kfree(nq->cmds); +-} +- +-static void cleanup_queues(struct nullb *nullb) +-{ +- int i; +- +- for (i = 0; i < nullb->nr_queues; i++) +- cleanup_queue(&nullb->queues[i]); +- +- kfree(nullb->queues); +-} +- +-static void null_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) +-{ +- struct nullb_queue *nq = hctx->driver_data; +- struct nullb *nullb = nq->dev->nullb; +- +- nullb->nr_queues--; +-} +- +-static void null_init_queue(struct nullb *nullb, struct nullb_queue *nq) +-{ +- init_waitqueue_head(&nq->wait); +- nq->queue_depth = nullb->queue_depth; +- nq->dev = nullb->dev; +-} +- +-static int null_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data, +- unsigned int hctx_idx) +-{ +- struct nullb *nullb = hctx->queue->queuedata; +- struct nullb_queue *nq; +- +-#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION +- if (g_init_hctx_str[0] && should_fail(&null_init_hctx_attr, 1)) +- return -EFAULT; +-#endif +- +- nq = &nullb->queues[hctx_idx]; +- hctx->driver_data = nq; +- null_init_queue(nullb, nq); +- nullb->nr_queues++; +- +- return 0; +-} +- +-static const struct blk_mq_ops null_mq_ops = { +- .queue_rq = null_queue_rq, +- .complete = null_complete_rq, +- .timeout = null_timeout_rq, +- .init_hctx = null_init_hctx, +- .exit_hctx = null_exit_hctx, +-}; +- +-static void null_del_dev(struct nullb *nullb) +-{ +- struct nullb_device *dev; +- +- if (!nullb) +- return; +- +- dev = nullb->dev; +- +- ida_simple_remove(&nullb_indexes, nullb->index); +- +- list_del_init(&nullb->list); +- +- del_gendisk(nullb->disk); +- +- if (test_bit(NULLB_DEV_FL_THROTTLED, &nullb->dev->flags)) { +- hrtimer_cancel(&nullb->bw_timer); +- atomic_long_set(&nullb->cur_bytes, LONG_MAX); +- null_restart_queue_async(nullb); +- } +- +- blk_cleanup_queue(nullb->q); +- if (dev->queue_mode == NULL_Q_MQ && +- nullb->tag_set == &nullb->__tag_set) +- blk_mq_free_tag_set(nullb->tag_set); +- put_disk(nullb->disk); +- cleanup_queues(nullb); +- if (null_cache_active(nullb)) +- null_free_device_storage(nullb->dev, true); +- kfree(nullb); +- dev->nullb = NULL; +-} +- +-static void null_config_discard(struct nullb *nullb) +-{ +- if (nullb->dev->discard == false) +- return; +- +- if (nullb->dev->zoned) { +- nullb->dev->discard 
= false; +- pr_info("discard option is ignored in zoned mode\n"); +- return; +- } +- +- nullb->q->limits.discard_granularity = nullb->dev->blocksize; +- nullb->q->limits.discard_alignment = nullb->dev->blocksize; +- blk_queue_max_discard_sectors(nullb->q, UINT_MAX >> 9); +- blk_queue_flag_set(QUEUE_FLAG_DISCARD, nullb->q); +-} +- +-static const struct block_device_operations null_bio_ops = { +- .owner = THIS_MODULE, +- .submit_bio = null_submit_bio, +- .report_zones = null_report_zones, +-}; +- +-static const struct block_device_operations null_rq_ops = { +- .owner = THIS_MODULE, +- .report_zones = null_report_zones, +-}; +- +-static int setup_commands(struct nullb_queue *nq) +-{ +- struct nullb_cmd *cmd; +- int i, tag_size; +- +- nq->cmds = kcalloc(nq->queue_depth, sizeof(*cmd), GFP_KERNEL); +- if (!nq->cmds) +- return -ENOMEM; +- +- tag_size = ALIGN(nq->queue_depth, BITS_PER_LONG) / BITS_PER_LONG; +- nq->tag_map = kcalloc(tag_size, sizeof(unsigned long), GFP_KERNEL); +- if (!nq->tag_map) { +- kfree(nq->cmds); +- return -ENOMEM; +- } +- +- for (i = 0; i < nq->queue_depth; i++) { +- cmd = &nq->cmds[i]; +- cmd->tag = -1U; +- } +- +- return 0; +-} +- +-static int setup_queues(struct nullb *nullb) +-{ +- nullb->queues = kcalloc(nr_cpu_ids, sizeof(struct nullb_queue), +- GFP_KERNEL); +- if (!nullb->queues) +- return -ENOMEM; +- +- nullb->queue_depth = nullb->dev->hw_queue_depth; +- +- return 0; +-} +- +-static int init_driver_queues(struct nullb *nullb) +-{ +- struct nullb_queue *nq; +- int i, ret = 0; +- +- for (i = 0; i < nullb->dev->submit_queues; i++) { +- nq = &nullb->queues[i]; +- +- null_init_queue(nullb, nq); +- +- ret = setup_commands(nq); +- if (ret) +- return ret; +- nullb->nr_queues++; +- } +- return 0; +-} +- +-static int null_gendisk_register(struct nullb *nullb) +-{ +- sector_t size = ((sector_t)nullb->dev->size * SZ_1M) >> SECTOR_SHIFT; +- struct gendisk *disk; +- +- disk = nullb->disk = alloc_disk_node(1, nullb->dev->home_node); +- if (!disk) +- return -ENOMEM; +- set_capacity(disk, size); +- +- disk->flags |= GENHD_FL_EXT_DEVT | GENHD_FL_SUPPRESS_PARTITION_INFO; +- disk->major = null_major; +- disk->first_minor = nullb->index; +- if (queue_is_mq(nullb->q)) +- disk->fops = &null_rq_ops; +- else +- disk->fops = &null_bio_ops; +- disk->private_data = nullb; +- disk->queue = nullb->q; +- strncpy(disk->disk_name, nullb->disk_name, DISK_NAME_LEN); +- +- if (nullb->dev->zoned) { +- int ret = null_register_zoned_dev(nullb); +- +- if (ret) +- return ret; +- } +- +- add_disk(disk); +- return 0; +-} +- +-static int null_init_tag_set(struct nullb *nullb, struct blk_mq_tag_set *set) +-{ +- set->ops = &null_mq_ops; +- set->nr_hw_queues = nullb ? nullb->dev->submit_queues : +- g_submit_queues; +- set->queue_depth = nullb ? nullb->dev->hw_queue_depth : +- g_hw_queue_depth; +- set->numa_node = nullb ? 
nullb->dev->home_node : g_home_node; +- set->cmd_size = sizeof(struct nullb_cmd); +- set->flags = BLK_MQ_F_SHOULD_MERGE; +- if (g_no_sched) +- set->flags |= BLK_MQ_F_NO_SCHED; +- if (g_shared_tag_bitmap) +- set->flags |= BLK_MQ_F_TAG_HCTX_SHARED; +- set->driver_data = NULL; +- +- if ((nullb && nullb->dev->blocking) || g_blocking) +- set->flags |= BLK_MQ_F_BLOCKING; +- +- return blk_mq_alloc_tag_set(set); +-} +- +-static int null_validate_conf(struct nullb_device *dev) +-{ +- dev->blocksize = round_down(dev->blocksize, 512); +- dev->blocksize = clamp_t(unsigned int, dev->blocksize, 512, 4096); +- +- if (dev->queue_mode == NULL_Q_MQ && dev->use_per_node_hctx) { +- if (dev->submit_queues != nr_online_nodes) +- dev->submit_queues = nr_online_nodes; +- } else if (dev->submit_queues > nr_cpu_ids) +- dev->submit_queues = nr_cpu_ids; +- else if (dev->submit_queues == 0) +- dev->submit_queues = 1; +- +- dev->queue_mode = min_t(unsigned int, dev->queue_mode, NULL_Q_MQ); +- dev->irqmode = min_t(unsigned int, dev->irqmode, NULL_IRQ_TIMER); +- +- /* Do memory allocation, so set blocking */ +- if (dev->memory_backed) +- dev->blocking = true; +- else /* cache is meaningless */ +- dev->cache_size = 0; +- dev->cache_size = min_t(unsigned long, ULONG_MAX / 1024 / 1024, +- dev->cache_size); +- dev->mbps = min_t(unsigned int, 1024 * 40, dev->mbps); +- /* can not stop a queue */ +- if (dev->queue_mode == NULL_Q_BIO) +- dev->mbps = 0; +- +- if (dev->zoned && +- (!dev->zone_size || !is_power_of_2(dev->zone_size))) { +- pr_err("zone_size must be power-of-two\n"); +- return -EINVAL; +- } +- +- return 0; +-} +- +-#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION +-static bool __null_setup_fault(struct fault_attr *attr, char *str) +-{ +- if (!str[0]) +- return true; +- +- if (!setup_fault_attr(attr, str)) +- return false; +- +- attr->verbose = 0; +- return true; +-} +-#endif +- +-static bool null_setup_fault(void) +-{ +-#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION +- if (!__null_setup_fault(&null_timeout_attr, g_timeout_str)) +- return false; +- if (!__null_setup_fault(&null_requeue_attr, g_requeue_str)) +- return false; +- if (!__null_setup_fault(&null_init_hctx_attr, g_init_hctx_str)) +- return false; +-#endif +- return true; +-} +- +-static int null_add_dev(struct nullb_device *dev) +-{ +- struct nullb *nullb; +- int rv; +- +- rv = null_validate_conf(dev); +- if (rv) +- return rv; +- +- nullb = kzalloc_node(sizeof(*nullb), GFP_KERNEL, dev->home_node); +- if (!nullb) { +- rv = -ENOMEM; +- goto out; +- } +- nullb->dev = dev; +- dev->nullb = nullb; +- +- spin_lock_init(&nullb->lock); +- +- rv = setup_queues(nullb); +- if (rv) +- goto out_free_nullb; +- +- if (dev->queue_mode == NULL_Q_MQ) { +- if (shared_tags) { +- nullb->tag_set = &tag_set; +- rv = 0; +- } else { +- nullb->tag_set = &nullb->__tag_set; +- rv = null_init_tag_set(nullb, nullb->tag_set); +- } +- +- if (rv) +- goto out_cleanup_queues; +- +- if (!null_setup_fault()) +- goto out_cleanup_queues; +- +- nullb->tag_set->timeout = 5 * HZ; +- nullb->q = blk_mq_init_queue_data(nullb->tag_set, nullb); +- if (IS_ERR(nullb->q)) { +- rv = -ENOMEM; +- goto out_cleanup_tags; +- } +- } else if (dev->queue_mode == NULL_Q_BIO) { +- nullb->q = blk_alloc_queue(dev->home_node); +- if (!nullb->q) { +- rv = -ENOMEM; +- goto out_cleanup_queues; +- } +- rv = init_driver_queues(nullb); +- if (rv) +- goto out_cleanup_blk_queue; +- } +- +- if (dev->mbps) { +- set_bit(NULLB_DEV_FL_THROTTLED, &dev->flags); +- nullb_setup_bwtimer(nullb); +- } +- +- if (dev->cache_size > 0) { +- 
set_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags); +- blk_queue_write_cache(nullb->q, true, true); +- } +- +- if (dev->zoned) { +- rv = null_init_zoned_dev(dev, nullb->q); +- if (rv) +- goto out_cleanup_blk_queue; +- } +- +- nullb->q->queuedata = nullb; +- blk_queue_flag_set(QUEUE_FLAG_NONROT, nullb->q); +- blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, nullb->q); +- +- mutex_lock(&lock); +- rv = ida_simple_get(&nullb_indexes, 0, 0, GFP_KERNEL); +- if (rv < 0) { +- mutex_unlock(&lock); +- goto out_cleanup_zone; +- } +- nullb->index = rv; +- dev->index = rv; +- mutex_unlock(&lock); +- +- blk_queue_logical_block_size(nullb->q, dev->blocksize); +- blk_queue_physical_block_size(nullb->q, dev->blocksize); +- +- null_config_discard(nullb); +- +- sprintf(nullb->disk_name, "nullb%d", nullb->index); +- +- rv = null_gendisk_register(nullb); +- if (rv) +- goto out_ida_free; +- +- mutex_lock(&lock); +- list_add_tail(&nullb->list, &nullb_list); +- mutex_unlock(&lock); +- +- return 0; +- +-out_ida_free: +- ida_free(&nullb_indexes, nullb->index); +-out_cleanup_zone: +- null_free_zoned_dev(dev); +-out_cleanup_blk_queue: +- blk_cleanup_queue(nullb->q); +-out_cleanup_tags: +- if (dev->queue_mode == NULL_Q_MQ && nullb->tag_set == &nullb->__tag_set) +- blk_mq_free_tag_set(nullb->tag_set); +-out_cleanup_queues: +- cleanup_queues(nullb); +-out_free_nullb: +- kfree(nullb); +- dev->nullb = NULL; +-out: +- return rv; +-} +- +-static int __init null_init(void) +-{ +- int ret = 0; +- unsigned int i; +- struct nullb *nullb; +- struct nullb_device *dev; +- +- if (g_bs > PAGE_SIZE) { +- pr_warn("invalid block size\n"); +- pr_warn("defaults block size to %lu\n", PAGE_SIZE); +- g_bs = PAGE_SIZE; +- } +- +- if (g_home_node != NUMA_NO_NODE && g_home_node >= nr_online_nodes) { +- pr_err("invalid home_node value\n"); +- g_home_node = NUMA_NO_NODE; +- } +- +- if (g_queue_mode == NULL_Q_RQ) { +- pr_err("legacy IO path no longer available\n"); +- return -EINVAL; +- } +- if (g_queue_mode == NULL_Q_MQ && g_use_per_node_hctx) { +- if (g_submit_queues != nr_online_nodes) { +- pr_warn("submit_queues param is set to %u.\n", +- nr_online_nodes); +- g_submit_queues = nr_online_nodes; +- } +- } else if (g_submit_queues > nr_cpu_ids) +- g_submit_queues = nr_cpu_ids; +- else if (g_submit_queues <= 0) +- g_submit_queues = 1; +- +- if (g_queue_mode == NULL_Q_MQ && shared_tags) { +- ret = null_init_tag_set(NULL, &tag_set); +- if (ret) +- return ret; +- } +- +- config_group_init(&nullb_subsys.su_group); +- mutex_init(&nullb_subsys.su_mutex); +- +- ret = configfs_register_subsystem(&nullb_subsys); +- if (ret) +- goto err_tagset; +- +- mutex_init(&lock); +- +- null_major = register_blkdev(0, "nullb"); +- if (null_major < 0) { +- ret = null_major; +- goto err_conf; +- } +- +- for (i = 0; i < nr_devices; i++) { +- dev = null_alloc_dev(); +- if (!dev) { +- ret = -ENOMEM; +- goto err_dev; +- } +- ret = null_add_dev(dev); +- if (ret) { +- null_free_dev(dev); +- goto err_dev; +- } +- } +- +- pr_info("module loaded\n"); +- return 0; +- +-err_dev: +- while (!list_empty(&nullb_list)) { +- nullb = list_entry(nullb_list.next, struct nullb, list); +- dev = nullb->dev; +- null_del_dev(nullb); +- null_free_dev(dev); +- } +- unregister_blkdev(null_major, "nullb"); +-err_conf: +- configfs_unregister_subsystem(&nullb_subsys); +-err_tagset: +- if (g_queue_mode == NULL_Q_MQ && shared_tags) +- blk_mq_free_tag_set(&tag_set); +- return ret; +-} +- +-static void __exit null_exit(void) +-{ +- struct nullb *nullb; +- +- configfs_unregister_subsystem(&nullb_subsys); +- +- 
unregister_blkdev(null_major, "nullb"); +- +- mutex_lock(&lock); +- while (!list_empty(&nullb_list)) { +- struct nullb_device *dev; +- +- nullb = list_entry(nullb_list.next, struct nullb, list); +- dev = nullb->dev; +- null_del_dev(nullb); +- null_free_dev(dev); +- } +- mutex_unlock(&lock); +- +- if (g_queue_mode == NULL_Q_MQ && shared_tags) +- blk_mq_free_tag_set(&tag_set); +-} +- +-module_init(null_init); +-module_exit(null_exit); +- +-MODULE_AUTHOR("Jens Axboe <axboe@kernel.dk>"); +-MODULE_LICENSE("GPL"); +diff --git a/drivers/block/null_blk_trace.c b/drivers/block/null_blk_trace.c +deleted file mode 100644 +index f246e7bff6982..0000000000000 +--- a/drivers/block/null_blk_trace.c ++++ /dev/null +@@ -1,21 +0,0 @@ +-// SPDX-License-Identifier: GPL-2.0 +-/* +- * null_blk trace related helpers. +- * +- * Copyright (C) 2020 Western Digital Corporation or its affiliates. +- */ +-#include "null_blk_trace.h" +- +-/* +- * Helper to use for all null_blk traces to extract disk name. +- */ +-const char *nullb_trace_disk_name(struct trace_seq *p, char *name) +-{ +- const char *ret = trace_seq_buffer_ptr(p); +- +- if (name && *name) +- trace_seq_printf(p, "disk=%s, ", name); +- trace_seq_putc(p, 0); +- +- return ret; +-} +diff --git a/drivers/block/null_blk_trace.h b/drivers/block/null_blk_trace.h +deleted file mode 100644 +index 4f83032eb5441..0000000000000 +--- a/drivers/block/null_blk_trace.h ++++ /dev/null +@@ -1,79 +0,0 @@ +-/* SPDX-License-Identifier: GPL-2.0 */ +-/* +- * null_blk device driver tracepoints. +- * +- * Copyright (C) 2020 Western Digital Corporation or its affiliates. +- */ +- +-#undef TRACE_SYSTEM +-#define TRACE_SYSTEM nullb +- +-#if !defined(_TRACE_NULLB_H) || defined(TRACE_HEADER_MULTI_READ) +-#define _TRACE_NULLB_H +- +-#include <linux/tracepoint.h> +-#include <linux/trace_seq.h> +- +-#include "null_blk.h" +- +-const char *nullb_trace_disk_name(struct trace_seq *p, char *name); +- +-#define __print_disk_name(name) nullb_trace_disk_name(p, name) +- +-#ifndef TRACE_HEADER_MULTI_READ +-static inline void __assign_disk_name(char *name, struct gendisk *disk) +-{ +- if (disk) +- memcpy(name, disk->disk_name, DISK_NAME_LEN); +- else +- memset(name, 0, DISK_NAME_LEN); +-} +-#endif +- +-TRACE_EVENT(nullb_zone_op, +- TP_PROTO(struct nullb_cmd *cmd, unsigned int zone_no, +- unsigned int zone_cond), +- TP_ARGS(cmd, zone_no, zone_cond), +- TP_STRUCT__entry( +- __array(char, disk, DISK_NAME_LEN) +- __field(enum req_opf, op) +- __field(unsigned int, zone_no) +- __field(unsigned int, zone_cond) +- ), +- TP_fast_assign( +- __entry->op = req_op(cmd->rq); +- __entry->zone_no = zone_no; +- __entry->zone_cond = zone_cond; +- __assign_disk_name(__entry->disk, cmd->rq->rq_disk); +- ), +- TP_printk("%s req=%-15s zone_no=%u zone_cond=%-10s", +- __print_disk_name(__entry->disk), +- blk_op_str(__entry->op), +- __entry->zone_no, +- blk_zone_cond_str(__entry->zone_cond)) +-); +- +-TRACE_EVENT(nullb_report_zones, +- TP_PROTO(struct nullb *nullb, unsigned int nr_zones), +- TP_ARGS(nullb, nr_zones), +- TP_STRUCT__entry( +- __array(char, disk, DISK_NAME_LEN) +- __field(unsigned int, nr_zones) +- ), +- TP_fast_assign( +- __entry->nr_zones = nr_zones; +- __assign_disk_name(__entry->disk, nullb->disk); +- ), +- TP_printk("%s nr_zones=%u", +- __print_disk_name(__entry->disk), __entry->nr_zones) +-); +- +-#endif /* _TRACE_NULLB_H */ +- +-#undef TRACE_INCLUDE_PATH +-#define TRACE_INCLUDE_PATH . 
+-#undef TRACE_INCLUDE_FILE +-#define TRACE_INCLUDE_FILE null_blk_trace +- +-/* This part must be outside protection */ +-#include <trace/define_trace.h> +diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk_zoned.c +deleted file mode 100644 +index f5df82c26c16f..0000000000000 +--- a/drivers/block/null_blk_zoned.c ++++ /dev/null +@@ -1,617 +0,0 @@ +-// SPDX-License-Identifier: GPL-2.0 +-#include <linux/vmalloc.h> +-#include <linux/bitmap.h> +-#include "null_blk.h" +- +-#define CREATE_TRACE_POINTS +-#include "null_blk_trace.h" +- +-#define MB_TO_SECTS(mb) (((sector_t)mb * SZ_1M) >> SECTOR_SHIFT) +- +-static inline unsigned int null_zone_no(struct nullb_device *dev, sector_t sect) +-{ +- return sect >> ilog2(dev->zone_size_sects); +-} +- +-int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q) +-{ +- sector_t dev_capacity_sects, zone_capacity_sects; +- sector_t sector = 0; +- unsigned int i; +- +- if (!is_power_of_2(dev->zone_size)) { +- pr_err("zone_size must be power-of-two\n"); +- return -EINVAL; +- } +- if (dev->zone_size > dev->size) { +- pr_err("Zone size larger than device capacity\n"); +- return -EINVAL; +- } +- +- if (!dev->zone_capacity) +- dev->zone_capacity = dev->zone_size; +- +- if (dev->zone_capacity > dev->zone_size) { +- pr_err("null_blk: zone capacity (%lu MB) larger than zone size (%lu MB)\n", +- dev->zone_capacity, dev->zone_size); +- return -EINVAL; +- } +- +- zone_capacity_sects = MB_TO_SECTS(dev->zone_capacity); +- dev_capacity_sects = MB_TO_SECTS(dev->size); +- dev->zone_size_sects = MB_TO_SECTS(dev->zone_size); +- dev->nr_zones = dev_capacity_sects >> ilog2(dev->zone_size_sects); +- if (dev_capacity_sects & (dev->zone_size_sects - 1)) +- dev->nr_zones++; +- +- dev->zones = kvmalloc_array(dev->nr_zones, sizeof(struct blk_zone), +- GFP_KERNEL | __GFP_ZERO); +- if (!dev->zones) +- return -ENOMEM; +- +- /* +- * With memory backing, the zone_lock spinlock needs to be temporarily +- * released to avoid scheduling in atomic context. To guarantee zone +- * information protection, use a bitmap to lock zones with +- * wait_on_bit_lock_io(). Sleeping on the lock is OK as memory backing +- * implies that the queue is marked with BLK_MQ_F_BLOCKING. 
+- */ +- spin_lock_init(&dev->zone_lock); +- if (dev->memory_backed) { +- dev->zone_locks = bitmap_zalloc(dev->nr_zones, GFP_KERNEL); +- if (!dev->zone_locks) { +- kvfree(dev->zones); +- return -ENOMEM; +- } +- } +- +- if (dev->zone_nr_conv >= dev->nr_zones) { +- dev->zone_nr_conv = dev->nr_zones - 1; +- pr_info("changed the number of conventional zones to %u", +- dev->zone_nr_conv); +- } +- +- /* Max active zones has to be < nbr of seq zones in order to be enforceable */ +- if (dev->zone_max_active >= dev->nr_zones - dev->zone_nr_conv) { +- dev->zone_max_active = 0; +- pr_info("zone_max_active limit disabled, limit >= zone count\n"); +- } +- +- /* Max open zones has to be <= max active zones */ +- if (dev->zone_max_active && dev->zone_max_open > dev->zone_max_active) { +- dev->zone_max_open = dev->zone_max_active; +- pr_info("changed the maximum number of open zones to %u\n", +- dev->nr_zones); +- } else if (dev->zone_max_open >= dev->nr_zones - dev->zone_nr_conv) { +- dev->zone_max_open = 0; +- pr_info("zone_max_open limit disabled, limit >= zone count\n"); +- } +- +- for (i = 0; i < dev->zone_nr_conv; i++) { +- struct blk_zone *zone = &dev->zones[i]; +- +- zone->start = sector; +- zone->len = dev->zone_size_sects; +- zone->capacity = zone->len; +- zone->wp = zone->start + zone->len; +- zone->type = BLK_ZONE_TYPE_CONVENTIONAL; +- zone->cond = BLK_ZONE_COND_NOT_WP; +- +- sector += dev->zone_size_sects; +- } +- +- for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) { +- struct blk_zone *zone = &dev->zones[i]; +- +- zone->start = zone->wp = sector; +- if (zone->start + dev->zone_size_sects > dev_capacity_sects) +- zone->len = dev_capacity_sects - zone->start; +- else +- zone->len = dev->zone_size_sects; +- zone->capacity = +- min_t(sector_t, zone->len, zone_capacity_sects); +- zone->type = BLK_ZONE_TYPE_SEQWRITE_REQ; +- zone->cond = BLK_ZONE_COND_EMPTY; +- +- sector += dev->zone_size_sects; +- } +- +- q->limits.zoned = BLK_ZONED_HM; +- blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q); +- blk_queue_required_elevator_features(q, ELEVATOR_F_ZBD_SEQ_WRITE); +- +- return 0; +-} +- +-int null_register_zoned_dev(struct nullb *nullb) +-{ +- struct nullb_device *dev = nullb->dev; +- struct request_queue *q = nullb->q; +- +- if (queue_is_mq(q)) { +- int ret = blk_revalidate_disk_zones(nullb->disk, NULL); +- +- if (ret) +- return ret; +- } else { +- blk_queue_chunk_sectors(q, dev->zone_size_sects); +- q->nr_zones = blkdev_nr_zones(nullb->disk); +- } +- +- blk_queue_max_zone_append_sectors(q, dev->zone_size_sects); +- blk_queue_max_open_zones(q, dev->zone_max_open); +- blk_queue_max_active_zones(q, dev->zone_max_active); +- +- return 0; +-} +- +-void null_free_zoned_dev(struct nullb_device *dev) +-{ +- bitmap_free(dev->zone_locks); +- kvfree(dev->zones); +- dev->zones = NULL; +-} +- +-static inline void null_lock_zone(struct nullb_device *dev, unsigned int zno) +-{ +- if (dev->memory_backed) +- wait_on_bit_lock_io(dev->zone_locks, zno, TASK_UNINTERRUPTIBLE); +- spin_lock_irq(&dev->zone_lock); +-} +- +-static inline void null_unlock_zone(struct nullb_device *dev, unsigned int zno) +-{ +- spin_unlock_irq(&dev->zone_lock); +- +- if (dev->memory_backed) +- clear_and_wake_up_bit(zno, dev->zone_locks); +-} +- +-int null_report_zones(struct gendisk *disk, sector_t sector, +- unsigned int nr_zones, report_zones_cb cb, void *data) +-{ +- struct nullb *nullb = disk->private_data; +- struct nullb_device *dev = nullb->dev; +- unsigned int first_zone, i, zno; +- struct blk_zone zone; +- int error; +- +- first_zone = 
null_zone_no(dev, sector); +- if (first_zone >= dev->nr_zones) +- return 0; +- +- nr_zones = min(nr_zones, dev->nr_zones - first_zone); +- trace_nullb_report_zones(nullb, nr_zones); +- +- zno = first_zone; +- for (i = 0; i < nr_zones; i++, zno++) { +- /* +- * Stacked DM target drivers will remap the zone information by +- * modifying the zone information passed to the report callback. +- * So use a local copy to avoid corruption of the device zone +- * array. +- */ +- null_lock_zone(dev, zno); +- memcpy(&zone, &dev->zones[zno], sizeof(struct blk_zone)); +- null_unlock_zone(dev, zno); +- +- error = cb(&zone, i, data); +- if (error) +- return error; +- } +- +- return nr_zones; +-} +- +-/* +- * This is called in the case of memory backing from null_process_cmd() +- * with the target zone already locked. +- */ +-size_t null_zone_valid_read_len(struct nullb *nullb, +- sector_t sector, unsigned int len) +-{ +- struct nullb_device *dev = nullb->dev; +- struct blk_zone *zone = &dev->zones[null_zone_no(dev, sector)]; +- unsigned int nr_sectors = len >> SECTOR_SHIFT; +- +- /* Read must be below the write pointer position */ +- if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL || +- sector + nr_sectors <= zone->wp) +- return len; +- +- if (sector > zone->wp) +- return 0; +- +- return (zone->wp - sector) << SECTOR_SHIFT; +-} +- +-static blk_status_t null_close_zone(struct nullb_device *dev, struct blk_zone *zone) +-{ +- if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) +- return BLK_STS_IOERR; +- +- switch (zone->cond) { +- case BLK_ZONE_COND_CLOSED: +- /* close operation on closed is not an error */ +- return BLK_STS_OK; +- case BLK_ZONE_COND_IMP_OPEN: +- dev->nr_zones_imp_open--; +- break; +- case BLK_ZONE_COND_EXP_OPEN: +- dev->nr_zones_exp_open--; +- break; +- case BLK_ZONE_COND_EMPTY: +- case BLK_ZONE_COND_FULL: +- default: +- return BLK_STS_IOERR; +- } +- +- if (zone->wp == zone->start) { +- zone->cond = BLK_ZONE_COND_EMPTY; +- } else { +- zone->cond = BLK_ZONE_COND_CLOSED; +- dev->nr_zones_closed++; +- } +- +- return BLK_STS_OK; +-} +- +-static void null_close_first_imp_zone(struct nullb_device *dev) +-{ +- unsigned int i; +- +- for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) { +- if (dev->zones[i].cond == BLK_ZONE_COND_IMP_OPEN) { +- null_close_zone(dev, &dev->zones[i]); +- return; +- } +- } +-} +- +-static blk_status_t null_check_active(struct nullb_device *dev) +-{ +- if (!dev->zone_max_active) +- return BLK_STS_OK; +- +- if (dev->nr_zones_exp_open + dev->nr_zones_imp_open + +- dev->nr_zones_closed < dev->zone_max_active) +- return BLK_STS_OK; +- +- return BLK_STS_ZONE_ACTIVE_RESOURCE; +-} +- +-static blk_status_t null_check_open(struct nullb_device *dev) +-{ +- if (!dev->zone_max_open) +- return BLK_STS_OK; +- +- if (dev->nr_zones_exp_open + dev->nr_zones_imp_open < dev->zone_max_open) +- return BLK_STS_OK; +- +- if (dev->nr_zones_imp_open) { +- if (null_check_active(dev) == BLK_STS_OK) { +- null_close_first_imp_zone(dev); +- return BLK_STS_OK; +- } +- } +- +- return BLK_STS_ZONE_OPEN_RESOURCE; +-} +- +-/* +- * This function matches the manage open zone resources function in the ZBC standard, +- * with the addition of max active zones support (added in the ZNS standard). +- * +- * The function determines if a zone can transition to implicit open or explicit open, +- * while maintaining the max open zone (and max active zone) limit(s). It may close an +- * implicit open zone in order to make additional zone resources available. 
+- * +- * ZBC states that an implicit open zone shall be closed only if there is not +- * room within the open limit. However, with the addition of an active limit, +- * it is not certain that closing an implicit open zone will allow a new zone +- * to be opened, since we might already be at the active limit capacity. +- */ +-static blk_status_t null_check_zone_resources(struct nullb_device *dev, struct blk_zone *zone) +-{ +- blk_status_t ret; +- +- switch (zone->cond) { +- case BLK_ZONE_COND_EMPTY: +- ret = null_check_active(dev); +- if (ret != BLK_STS_OK) +- return ret; +- fallthrough; +- case BLK_ZONE_COND_CLOSED: +- return null_check_open(dev); +- default: +- /* Should never be called for other states */ +- WARN_ON(1); +- return BLK_STS_IOERR; +- } +-} +- +-static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector, +- unsigned int nr_sectors, bool append) +-{ +- struct nullb_device *dev = cmd->nq->dev; +- unsigned int zno = null_zone_no(dev, sector); +- struct blk_zone *zone = &dev->zones[zno]; +- blk_status_t ret; +- +- trace_nullb_zone_op(cmd, zno, zone->cond); +- +- if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) { +- if (append) +- return BLK_STS_IOERR; +- return null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors); +- } +- +- null_lock_zone(dev, zno); +- +- switch (zone->cond) { +- case BLK_ZONE_COND_FULL: +- /* Cannot write to a full zone */ +- ret = BLK_STS_IOERR; +- goto unlock; +- case BLK_ZONE_COND_EMPTY: +- case BLK_ZONE_COND_CLOSED: +- ret = null_check_zone_resources(dev, zone); +- if (ret != BLK_STS_OK) +- goto unlock; +- break; +- case BLK_ZONE_COND_IMP_OPEN: +- case BLK_ZONE_COND_EXP_OPEN: +- break; +- default: +- /* Invalid zone condition */ +- ret = BLK_STS_IOERR; +- goto unlock; +- } +- +- /* +- * Regular writes must be at the write pointer position. +- * Zone append writes are automatically issued at the write +- * pointer and the position returned using the request or BIO +- * sector. +- */ +- if (append) { +- sector = zone->wp; +- if (cmd->bio) +- cmd->bio->bi_iter.bi_sector = sector; +- else +- cmd->rq->__sector = sector; +- } else if (sector != zone->wp) { +- ret = BLK_STS_IOERR; +- goto unlock; +- } +- +- if (zone->wp + nr_sectors > zone->start + zone->capacity) { +- ret = BLK_STS_IOERR; +- goto unlock; +- } +- +- if (zone->cond == BLK_ZONE_COND_CLOSED) { +- dev->nr_zones_closed--; +- dev->nr_zones_imp_open++; +- } else if (zone->cond == BLK_ZONE_COND_EMPTY) { +- dev->nr_zones_imp_open++; +- } +- if (zone->cond != BLK_ZONE_COND_EXP_OPEN) +- zone->cond = BLK_ZONE_COND_IMP_OPEN; +- +- /* +- * Memory backing allocation may sleep: release the zone_lock spinlock +- * to avoid scheduling in atomic context. Zone operation atomicity is +- * still guaranteed through the zone_locks bitmap. 
+- */ +- if (dev->memory_backed) +- spin_unlock_irq(&dev->zone_lock); +- ret = null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors); +- if (dev->memory_backed) +- spin_lock_irq(&dev->zone_lock); +- +- if (ret != BLK_STS_OK) +- goto unlock; +- +- zone->wp += nr_sectors; +- if (zone->wp == zone->start + zone->capacity) { +- if (zone->cond == BLK_ZONE_COND_EXP_OPEN) +- dev->nr_zones_exp_open--; +- else if (zone->cond == BLK_ZONE_COND_IMP_OPEN) +- dev->nr_zones_imp_open--; +- zone->cond = BLK_ZONE_COND_FULL; +- } +- ret = BLK_STS_OK; +- +-unlock: +- null_unlock_zone(dev, zno); +- +- return ret; +-} +- +-static blk_status_t null_open_zone(struct nullb_device *dev, struct blk_zone *zone) +-{ +- blk_status_t ret; +- +- if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) +- return BLK_STS_IOERR; +- +- switch (zone->cond) { +- case BLK_ZONE_COND_EXP_OPEN: +- /* open operation on exp open is not an error */ +- return BLK_STS_OK; +- case BLK_ZONE_COND_EMPTY: +- ret = null_check_zone_resources(dev, zone); +- if (ret != BLK_STS_OK) +- return ret; +- break; +- case BLK_ZONE_COND_IMP_OPEN: +- dev->nr_zones_imp_open--; +- break; +- case BLK_ZONE_COND_CLOSED: +- ret = null_check_zone_resources(dev, zone); +- if (ret != BLK_STS_OK) +- return ret; +- dev->nr_zones_closed--; +- break; +- case BLK_ZONE_COND_FULL: +- default: +- return BLK_STS_IOERR; +- } +- +- zone->cond = BLK_ZONE_COND_EXP_OPEN; +- dev->nr_zones_exp_open++; +- +- return BLK_STS_OK; +-} +- +-static blk_status_t null_finish_zone(struct nullb_device *dev, struct blk_zone *zone) +-{ +- blk_status_t ret; +- +- if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) +- return BLK_STS_IOERR; +- +- switch (zone->cond) { +- case BLK_ZONE_COND_FULL: +- /* finish operation on full is not an error */ +- return BLK_STS_OK; +- case BLK_ZONE_COND_EMPTY: +- ret = null_check_zone_resources(dev, zone); +- if (ret != BLK_STS_OK) +- return ret; +- break; +- case BLK_ZONE_COND_IMP_OPEN: +- dev->nr_zones_imp_open--; +- break; +- case BLK_ZONE_COND_EXP_OPEN: +- dev->nr_zones_exp_open--; +- break; +- case BLK_ZONE_COND_CLOSED: +- ret = null_check_zone_resources(dev, zone); +- if (ret != BLK_STS_OK) +- return ret; +- dev->nr_zones_closed--; +- break; +- default: +- return BLK_STS_IOERR; +- } +- +- zone->cond = BLK_ZONE_COND_FULL; +- zone->wp = zone->start + zone->len; +- +- return BLK_STS_OK; +-} +- +-static blk_status_t null_reset_zone(struct nullb_device *dev, struct blk_zone *zone) +-{ +- if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) +- return BLK_STS_IOERR; +- +- switch (zone->cond) { +- case BLK_ZONE_COND_EMPTY: +- /* reset operation on empty is not an error */ +- return BLK_STS_OK; +- case BLK_ZONE_COND_IMP_OPEN: +- dev->nr_zones_imp_open--; +- break; +- case BLK_ZONE_COND_EXP_OPEN: +- dev->nr_zones_exp_open--; +- break; +- case BLK_ZONE_COND_CLOSED: +- dev->nr_zones_closed--; +- break; +- case BLK_ZONE_COND_FULL: +- break; +- default: +- return BLK_STS_IOERR; +- } +- +- zone->cond = BLK_ZONE_COND_EMPTY; +- zone->wp = zone->start; +- +- return BLK_STS_OK; +-} +- +-static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_opf op, +- sector_t sector) +-{ +- struct nullb_device *dev = cmd->nq->dev; +- unsigned int zone_no; +- struct blk_zone *zone; +- blk_status_t ret; +- size_t i; +- +- if (op == REQ_OP_ZONE_RESET_ALL) { +- for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) { +- null_lock_zone(dev, i); +- zone = &dev->zones[i]; +- if (zone->cond != BLK_ZONE_COND_EMPTY) { +- null_reset_zone(dev, zone); +- trace_nullb_zone_op(cmd, i, zone->cond); +- } +- 
null_unlock_zone(dev, i); +- } +- return BLK_STS_OK; +- } +- +- zone_no = null_zone_no(dev, sector); +- zone = &dev->zones[zone_no]; +- +- null_lock_zone(dev, zone_no); +- +- switch (op) { +- case REQ_OP_ZONE_RESET: +- ret = null_reset_zone(dev, zone); +- break; +- case REQ_OP_ZONE_OPEN: +- ret = null_open_zone(dev, zone); +- break; +- case REQ_OP_ZONE_CLOSE: +- ret = null_close_zone(dev, zone); +- break; +- case REQ_OP_ZONE_FINISH: +- ret = null_finish_zone(dev, zone); +- break; +- default: +- ret = BLK_STS_NOTSUPP; +- break; +- } +- +- if (ret == BLK_STS_OK) +- trace_nullb_zone_op(cmd, zone_no, zone->cond); +- +- null_unlock_zone(dev, zone_no); +- +- return ret; +-} +- +-blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, enum req_opf op, +- sector_t sector, sector_t nr_sectors) +-{ +- struct nullb_device *dev = cmd->nq->dev; +- unsigned int zno = null_zone_no(dev, sector); +- blk_status_t sts; +- +- switch (op) { +- case REQ_OP_WRITE: +- sts = null_zone_write(cmd, sector, nr_sectors, false); +- break; +- case REQ_OP_ZONE_APPEND: +- sts = null_zone_write(cmd, sector, nr_sectors, true); +- break; +- case REQ_OP_ZONE_RESET: +- case REQ_OP_ZONE_RESET_ALL: +- case REQ_OP_ZONE_OPEN: +- case REQ_OP_ZONE_CLOSE: +- case REQ_OP_ZONE_FINISH: +- sts = null_zone_mgmt(cmd, op, sector); +- break; +- default: +- null_lock_zone(dev, zno); +- sts = null_process_cmd(cmd, op, sector, nr_sectors); +- null_unlock_zone(dev, zno); +- } +- +- return sts; +-} +diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c +index 39aeebc6837da..d9e41d3bbe717 100644 +--- a/drivers/block/sunvdc.c ++++ b/drivers/block/sunvdc.c +@@ -984,6 +984,8 @@ static int vdc_port_probe(struct vio_dev *vdev, const struct vio_device_id *id) + print_version(); + + hp = mdesc_grab(); ++ if (!hp) ++ return -ENODEV; + + err = -ENODEV; + if ((vdev->dev_no << PARTITION_SHIFT) & ~(u64)MINORMASK) { +diff --git a/drivers/clk/Kconfig b/drivers/clk/Kconfig +index c715d4681a0b8..4ae49eae45869 100644 +--- a/drivers/clk/Kconfig ++++ b/drivers/clk/Kconfig +@@ -79,7 +79,7 @@ config COMMON_CLK_RK808 + config COMMON_CLK_HI655X + tristate "Clock driver for Hi655x" if EXPERT + depends on (MFD_HI655X_PMIC || COMPILE_TEST) +- depends on REGMAP ++ select REGMAP + default MFD_HI655X_PMIC + help + This driver supports the hi655x PMIC clock. 
This +diff --git a/drivers/cpuidle/cpuidle-psci-domain.c b/drivers/cpuidle/cpuidle-psci-domain.c +index 4a031c62f92a1..5098639d41f12 100644 +--- a/drivers/cpuidle/cpuidle-psci-domain.c ++++ b/drivers/cpuidle/cpuidle-psci-domain.c +@@ -182,7 +182,8 @@ static void psci_pd_remove(void) + struct psci_pd_provider *pd_provider, *it; + struct generic_pm_domain *genpd; + +- list_for_each_entry_safe(pd_provider, it, &psci_pd_providers, link) { ++ list_for_each_entry_safe_reverse(pd_provider, it, ++ &psci_pd_providers, link) { + of_genpd_del_provider(pd_provider->node); + + genpd = of_genpd_remove_last(pd_provider->node); +diff --git a/drivers/firmware/xilinx/zynqmp.c b/drivers/firmware/xilinx/zynqmp.c +index 9e6504592646e..300ba2991936b 100644 +--- a/drivers/firmware/xilinx/zynqmp.c ++++ b/drivers/firmware/xilinx/zynqmp.c +@@ -171,7 +171,7 @@ static int zynqmp_pm_feature(u32 api_id) + } + + /* Add new entry if not present */ +- feature_data = kmalloc(sizeof(*feature_data), GFP_KERNEL); ++ feature_data = kmalloc(sizeof(*feature_data), GFP_ATOMIC); + if (!feature_data) + return -ENOMEM; + +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c +index 159be13ef20bb..2c19b3775179b 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c +@@ -528,16 +528,13 @@ static struct kfd_event_waiter *alloc_event_waiters(uint32_t num_events) + struct kfd_event_waiter *event_waiters; + uint32_t i; + +- event_waiters = kmalloc_array(num_events, +- sizeof(struct kfd_event_waiter), +- GFP_KERNEL); ++ event_waiters = kcalloc(num_events, sizeof(struct kfd_event_waiter), ++ GFP_KERNEL); + if (!event_waiters) + return NULL; + +- for (i = 0; (event_waiters) && (i < num_events) ; i++) { ++ for (i = 0; i < num_events; i++) + init_wait(&event_waiters[i].wait); +- event_waiters[i].activated = false; +- } + + return event_waiters; + } +diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn30/display_mode_vba_30.c b/drivers/gpu/drm/amd/display/dc/dml/dcn30/display_mode_vba_30.c +index e427f4ffa0807..e5b1002d7f3f0 100644 +--- a/drivers/gpu/drm/amd/display/dc/dml/dcn30/display_mode_vba_30.c ++++ b/drivers/gpu/drm/amd/display/dc/dml/dcn30/display_mode_vba_30.c +@@ -1868,7 +1868,10 @@ static unsigned int CalculateVMAndRowBytes( + } + + if (SurfaceTiling == dm_sw_linear) { +- *dpte_row_height = dml_min(128, 1 << (unsigned int) dml_floor(dml_log2(PTEBufferSizeInRequests * *PixelPTEReqWidth / Pitch), 1)); ++ if (PTEBufferSizeInRequests == 0) ++ *dpte_row_height = 1; ++ else ++ *dpte_row_height = dml_min(128, 1 << (unsigned int) dml_floor(dml_log2(PTEBufferSizeInRequests * *PixelPTEReqWidth / Pitch), 1)); + *dpte_row_width_ub = (dml_ceil(((double) SwathWidth - 1) / *PixelPTEReqWidth, 1) + 1) * *PixelPTEReqWidth; + *PixelPTEBytesPerRow = *dpte_row_width_ub / *PixelPTEReqWidth * *PTERequestSize; + } else if (ScanDirection != dm_vert) { +diff --git a/drivers/gpu/drm/drm_gem_shmem_helper.c b/drivers/gpu/drm/drm_gem_shmem_helper.c +index c56656a95cf99..b7bb5610dfe21 100644 +--- a/drivers/gpu/drm/drm_gem_shmem_helper.c ++++ b/drivers/gpu/drm/drm_gem_shmem_helper.c +@@ -614,11 +614,14 @@ int drm_gem_shmem_mmap(struct drm_gem_object *obj, struct vm_area_struct *vma) + int ret; + + if (obj->import_attach) { +- /* Drop the reference drm_gem_mmap_obj() acquired.*/ +- drm_gem_object_put(obj); + vma->vm_private_data = NULL; ++ ret = dma_buf_mmap(obj->dma_buf, vma, 0); ++ ++ /* Drop the reference drm_gem_mmap_obj() acquired.*/ ++ if (!ret) ++ drm_gem_object_put(obj); + 
+- return dma_buf_mmap(obj->dma_buf, vma, 0); ++ return ret; + } + + shmem = to_drm_gem_shmem_obj(obj); +diff --git a/drivers/gpu/drm/i915/gt/intel_ring.c b/drivers/gpu/drm/i915/gt/intel_ring.c +index 69b2e5509d678..de67b2745258f 100644 +--- a/drivers/gpu/drm/i915/gt/intel_ring.c ++++ b/drivers/gpu/drm/i915/gt/intel_ring.c +@@ -108,7 +108,7 @@ static struct i915_vma *create_ring_vma(struct i915_ggtt *ggtt, int size) + struct i915_vma *vma; + + obj = ERR_PTR(-ENODEV); +- if (i915_ggtt_has_aperture(ggtt)) ++ if (i915_ggtt_has_aperture(ggtt) && !HAS_LLC(i915)) + obj = i915_gem_object_create_stolen(i915, size); + if (IS_ERR(obj)) + obj = i915_gem_object_create_internal(i915, size); +diff --git a/drivers/gpu/drm/i915/i915_active.c b/drivers/gpu/drm/i915/i915_active.c +index c4c2d24dc5094..0532a5069c04b 100644 +--- a/drivers/gpu/drm/i915/i915_active.c ++++ b/drivers/gpu/drm/i915/i915_active.c +@@ -432,8 +432,7 @@ replace_barrier(struct i915_active *ref, struct i915_active_fence *active) + * we can use it to substitute for the pending idle-barrer + * request that we want to emit on the kernel_context. + */ +- __active_del_barrier(ref, node_from_active(active)); +- return true; ++ return __active_del_barrier(ref, node_from_active(active)); + } + + int i915_active_ref(struct i915_active *ref, u64 idx, struct dma_fence *fence) +@@ -446,16 +445,19 @@ int i915_active_ref(struct i915_active *ref, u64 idx, struct dma_fence *fence) + if (err) + return err; + +- active = active_instance(ref, idx); +- if (!active) { +- err = -ENOMEM; +- goto out; +- } ++ do { ++ active = active_instance(ref, idx); ++ if (!active) { ++ err = -ENOMEM; ++ goto out; ++ } ++ ++ if (replace_barrier(ref, active)) { ++ RCU_INIT_POINTER(active->fence, NULL); ++ atomic_dec(&ref->count); ++ } ++ } while (unlikely(is_barrier(active))); + +- if (replace_barrier(ref, active)) { +- RCU_INIT_POINTER(active->fence, NULL); +- atomic_dec(&ref->count); +- } + if (!__i915_active_fence_set(active, fence)) + __i915_active_acquire(ref); + +diff --git a/drivers/gpu/drm/meson/meson_vpp.c b/drivers/gpu/drm/meson/meson_vpp.c +index 154837688ab0d..5df1957c8e41f 100644 +--- a/drivers/gpu/drm/meson/meson_vpp.c ++++ b/drivers/gpu/drm/meson/meson_vpp.c +@@ -100,6 +100,8 @@ void meson_vpp_init(struct meson_drm *priv) + priv->io_base + _REG(VPP_DOLBY_CTRL)); + writel_relaxed(0x1020080, + priv->io_base + _REG(VPP_DUMMY_DATA1)); ++ writel_relaxed(0x42020, ++ priv->io_base + _REG(VPP_DUMMY_DATA)); + } else if (meson_vpu_is_compatible(priv, VPU_COMPATIBLE_G12A)) + writel_relaxed(0xf, priv->io_base + _REG(DOLBY_PATH_CTRL)); + +diff --git a/drivers/gpu/drm/panfrost/panfrost_mmu.c b/drivers/gpu/drm/panfrost/panfrost_mmu.c +index 13596961ae17f..5ff856ef7d88c 100644 +--- a/drivers/gpu/drm/panfrost/panfrost_mmu.c ++++ b/drivers/gpu/drm/panfrost/panfrost_mmu.c +@@ -236,7 +236,7 @@ static void panfrost_mmu_flush_range(struct panfrost_device *pfdev, + if (pm_runtime_active(pfdev->dev)) + mmu_hw_do_operation(pfdev, mmu, iova, size, AS_COMMAND_FLUSH_PT); + +- pm_runtime_put_sync_autosuspend(pfdev->dev); ++ pm_runtime_put_autosuspend(pfdev->dev); + } + + static int mmu_map_sg(struct panfrost_device *pfdev, struct panfrost_mmu *mmu, +diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c +index 5f9ec1d1464a2..524d6d712e724 100644 +--- a/drivers/hid/hid-core.c ++++ b/drivers/hid/hid-core.c +@@ -258,6 +258,7 @@ static int hid_add_field(struct hid_parser *parser, unsigned report_type, unsign + { + struct hid_report *report; + struct hid_field *field; ++ unsigned int 
max_buffer_size = HID_MAX_BUFFER_SIZE; + unsigned int usages; + unsigned int offset; + unsigned int i; +@@ -288,8 +289,11 @@ static int hid_add_field(struct hid_parser *parser, unsigned report_type, unsign + offset = report->size; + report->size += parser->global.report_size * parser->global.report_count; + ++ if (parser->device->ll_driver->max_buffer_size) ++ max_buffer_size = parser->device->ll_driver->max_buffer_size; ++ + /* Total size check: Allow for possible report index byte */ +- if (report->size > (HID_MAX_BUFFER_SIZE - 1) << 3) { ++ if (report->size > (max_buffer_size - 1) << 3) { + hid_err(parser->device, "report is too long\n"); + return -1; + } +@@ -1752,6 +1756,7 @@ int hid_report_raw_event(struct hid_device *hid, int type, u8 *data, u32 size, + struct hid_report_enum *report_enum = hid->report_enum + type; + struct hid_report *report; + struct hid_driver *hdrv; ++ int max_buffer_size = HID_MAX_BUFFER_SIZE; + unsigned int a; + u32 rsize, csize = size; + u8 *cdata = data; +@@ -1768,10 +1773,13 @@ int hid_report_raw_event(struct hid_device *hid, int type, u8 *data, u32 size, + + rsize = hid_compute_report_size(report); + +- if (report_enum->numbered && rsize >= HID_MAX_BUFFER_SIZE) +- rsize = HID_MAX_BUFFER_SIZE - 1; +- else if (rsize > HID_MAX_BUFFER_SIZE) +- rsize = HID_MAX_BUFFER_SIZE; ++ if (hid->ll_driver->max_buffer_size) ++ max_buffer_size = hid->ll_driver->max_buffer_size; ++ ++ if (report_enum->numbered && rsize >= max_buffer_size) ++ rsize = max_buffer_size - 1; ++ else if (rsize > max_buffer_size) ++ rsize = max_buffer_size; + + if (csize < rsize) { + dbg_hid("report %d is too short, (%d < %d)\n", report->id, +diff --git a/drivers/hid/uhid.c b/drivers/hid/uhid.c +index fc06d8bb42e0f..ba0ca652b9dab 100644 +--- a/drivers/hid/uhid.c ++++ b/drivers/hid/uhid.c +@@ -395,6 +395,7 @@ struct hid_ll_driver uhid_hid_driver = { + .parse = uhid_hid_parse, + .raw_request = uhid_hid_raw_request, + .output_report = uhid_hid_output_report, ++ .max_buffer_size = UHID_DATA_MAX, + }; + EXPORT_SYMBOL_GPL(uhid_hid_driver); + +diff --git a/drivers/hwmon/adt7475.c b/drivers/hwmon/adt7475.c +index 9d5b019651f2d..6b84822e7d93b 100644 +--- a/drivers/hwmon/adt7475.c ++++ b/drivers/hwmon/adt7475.c +@@ -486,10 +486,10 @@ static ssize_t temp_store(struct device *dev, struct device_attribute *attr, + val = (temp - val) / 1000; + + if (sattr->index != 1) { +- data->temp[HYSTERSIS][sattr->index] &= 0xF0; ++ data->temp[HYSTERSIS][sattr->index] &= 0x0F; + data->temp[HYSTERSIS][sattr->index] |= (val & 0xF) << 4; + } else { +- data->temp[HYSTERSIS][sattr->index] &= 0x0F; ++ data->temp[HYSTERSIS][sattr->index] &= 0xF0; + data->temp[HYSTERSIS][sattr->index] |= (val & 0xF); + } + +@@ -554,11 +554,11 @@ static ssize_t temp_st_show(struct device *dev, struct device_attribute *attr, + val = data->enh_acoustics[0] & 0xf; + break; + case 1: +- val = (data->enh_acoustics[1] >> 4) & 0xf; ++ val = data->enh_acoustics[1] & 0xf; + break; + case 2: + default: +- val = data->enh_acoustics[1] & 0xf; ++ val = (data->enh_acoustics[1] >> 4) & 0xf; + break; + } + +diff --git a/drivers/hwmon/ina3221.c b/drivers/hwmon/ina3221.c +index d3c98115042b5..836e7579e166a 100644 +--- a/drivers/hwmon/ina3221.c ++++ b/drivers/hwmon/ina3221.c +@@ -772,7 +772,7 @@ static int ina3221_probe_child_from_dt(struct device *dev, + return ret; + } else if (val > INA3221_CHANNEL3) { + dev_err(dev, "invalid reg %d of %pOFn\n", val, child); +- return ret; ++ return -EINVAL; + } + + input = &ina->inputs[val]; +diff --git 
a/drivers/hwmon/pmbus/adm1266.c b/drivers/hwmon/pmbus/adm1266.c +index c7b373ba92f21..d1b2e936546fd 100644 +--- a/drivers/hwmon/pmbus/adm1266.c ++++ b/drivers/hwmon/pmbus/adm1266.c +@@ -301,6 +301,7 @@ static int adm1266_config_gpio(struct adm1266_data *data) + data->gc.label = name; + data->gc.parent = &data->client->dev; + data->gc.owner = THIS_MODULE; ++ data->gc.can_sleep = true; + data->gc.base = -1; + data->gc.names = data->gpio_names; + data->gc.ngpio = ARRAY_SIZE(data->gpio_names); +diff --git a/drivers/hwmon/pmbus/ucd9000.c b/drivers/hwmon/pmbus/ucd9000.c +index f8017993e2b4d..9e26cc084a176 100644 +--- a/drivers/hwmon/pmbus/ucd9000.c ++++ b/drivers/hwmon/pmbus/ucd9000.c +@@ -7,6 +7,7 @@ + */ + + #include <linux/debugfs.h> ++#include <linux/delay.h> + #include <linux/kernel.h> + #include <linux/module.h> + #include <linux/of_device.h> +@@ -16,6 +17,7 @@ + #include <linux/i2c.h> + #include <linux/pmbus.h> + #include <linux/gpio/driver.h> ++#include <linux/timekeeping.h> + #include "pmbus.h" + + enum chips { ucd9000, ucd90120, ucd90124, ucd90160, ucd90320, ucd9090, +@@ -65,6 +67,7 @@ struct ucd9000_data { + struct gpio_chip gpio; + #endif + struct dentry *debugfs; ++ ktime_t write_time; + }; + #define to_ucd9000_data(_info) container_of(_info, struct ucd9000_data, info) + +@@ -73,6 +76,73 @@ struct ucd9000_debugfs_entry { + u8 index; + }; + ++/* ++ * It has been observed that the UCD90320 randomly fails register access when ++ * doing another access right on the back of a register write. To mitigate this ++ * make sure that there is a minimum delay between a write access and the ++ * following access. The 250us is based on experimental data. At a delay of ++ * 200us the issue seems to go away. Add a bit of extra margin to allow for ++ * system to system differences. 
++ */ ++#define UCD90320_WAIT_DELAY_US 250 ++ ++static inline void ucd90320_wait(const struct ucd9000_data *data) ++{ ++ s64 delta = ktime_us_delta(ktime_get(), data->write_time); ++ ++ if (delta < UCD90320_WAIT_DELAY_US) ++ udelay(UCD90320_WAIT_DELAY_US - delta); ++} ++ ++static int ucd90320_read_word_data(struct i2c_client *client, int page, ++ int phase, int reg) ++{ ++ const struct pmbus_driver_info *info = pmbus_get_driver_info(client); ++ struct ucd9000_data *data = to_ucd9000_data(info); ++ ++ if (reg >= PMBUS_VIRT_BASE) ++ return -ENXIO; ++ ++ ucd90320_wait(data); ++ return pmbus_read_word_data(client, page, phase, reg); ++} ++ ++static int ucd90320_read_byte_data(struct i2c_client *client, int page, int reg) ++{ ++ const struct pmbus_driver_info *info = pmbus_get_driver_info(client); ++ struct ucd9000_data *data = to_ucd9000_data(info); ++ ++ ucd90320_wait(data); ++ return pmbus_read_byte_data(client, page, reg); ++} ++ ++static int ucd90320_write_word_data(struct i2c_client *client, int page, ++ int reg, u16 word) ++{ ++ const struct pmbus_driver_info *info = pmbus_get_driver_info(client); ++ struct ucd9000_data *data = to_ucd9000_data(info); ++ int ret; ++ ++ ucd90320_wait(data); ++ ret = pmbus_write_word_data(client, page, reg, word); ++ data->write_time = ktime_get(); ++ ++ return ret; ++} ++ ++static int ucd90320_write_byte(struct i2c_client *client, int page, u8 value) ++{ ++ const struct pmbus_driver_info *info = pmbus_get_driver_info(client); ++ struct ucd9000_data *data = to_ucd9000_data(info); ++ int ret; ++ ++ ucd90320_wait(data); ++ ret = pmbus_write_byte(client, page, value); ++ data->write_time = ktime_get(); ++ ++ return ret; ++} ++ + static int ucd9000_get_fan_config(struct i2c_client *client, int fan) + { + int fan_config = 0; +@@ -598,6 +668,11 @@ static int ucd9000_probe(struct i2c_client *client) + info->read_byte_data = ucd9000_read_byte_data; + info->func[0] |= PMBUS_HAVE_FAN12 | PMBUS_HAVE_STATUS_FAN12 + | PMBUS_HAVE_FAN34 | PMBUS_HAVE_STATUS_FAN34; ++ } else if (mid->driver_data == ucd90320) { ++ info->read_byte_data = ucd90320_read_byte_data; ++ info->read_word_data = ucd90320_read_word_data; ++ info->write_byte = ucd90320_write_byte; ++ info->write_word_data = ucd90320_write_word_data; + } + + ucd9000_probe_gpio(client, mid, data); +diff --git a/drivers/hwmon/tmp513.c b/drivers/hwmon/tmp513.c +index 47bbe47e062fd..7d5f7441aceb1 100644 +--- a/drivers/hwmon/tmp513.c ++++ b/drivers/hwmon/tmp513.c +@@ -758,7 +758,7 @@ static int tmp51x_probe(struct i2c_client *client) + static struct i2c_driver tmp51x_driver = { + .driver = { + .name = "tmp51x", +- .of_match_table = of_match_ptr(tmp51x_of_match), ++ .of_match_table = tmp51x_of_match, + }, + .probe_new = tmp51x_probe, + .id_table = tmp51x_id, +diff --git a/drivers/hwmon/xgene-hwmon.c b/drivers/hwmon/xgene-hwmon.c +index f2a5af239c956..f5d3cf86753f7 100644 +--- a/drivers/hwmon/xgene-hwmon.c ++++ b/drivers/hwmon/xgene-hwmon.c +@@ -768,6 +768,7 @@ static int xgene_hwmon_remove(struct platform_device *pdev) + { + struct xgene_hwmon_dev *ctx = platform_get_drvdata(pdev); + ++ cancel_work_sync(&ctx->workq); + hwmon_device_unregister(ctx->hwmon_dev); + kfifo_free(&ctx->async_msg_fifo); + if (acpi_disabled) +diff --git a/drivers/interconnect/core.c b/drivers/interconnect/core.c +index ceb6cdc20484e..7db6d0fc6ec2e 100644 +--- a/drivers/interconnect/core.c ++++ b/drivers/interconnect/core.c +@@ -850,6 +850,10 @@ void icc_node_destroy(int id) + + mutex_unlock(&icc_lock); + ++ if (!node) ++ return; ++ ++ 
kfree(node->links); + kfree(node); + } + EXPORT_SYMBOL_GPL(icc_node_destroy); +diff --git a/drivers/media/i2c/m5mols/m5mols_core.c b/drivers/media/i2c/m5mols/m5mols_core.c +index 21666d705e372..dcf9e4d4ee6b8 100644 +--- a/drivers/media/i2c/m5mols/m5mols_core.c ++++ b/drivers/media/i2c/m5mols/m5mols_core.c +@@ -488,7 +488,7 @@ static enum m5mols_restype __find_restype(u32 code) + do { + if (code == m5mols_default_ffmt[type].code) + return type; +- } while (type++ != SIZE_DEFAULT_FFMT); ++ } while (++type != SIZE_DEFAULT_FFMT); + + return 0; + } +diff --git a/drivers/mmc/host/atmel-mci.c b/drivers/mmc/host/atmel-mci.c +index af85b32c6c1c8..c468f9a02ef6b 100644 +--- a/drivers/mmc/host/atmel-mci.c ++++ b/drivers/mmc/host/atmel-mci.c +@@ -1818,7 +1818,6 @@ static void atmci_tasklet_func(unsigned long priv) + atmci_writel(host, ATMCI_IER, ATMCI_NOTBUSY); + state = STATE_WAITING_NOTBUSY; + } else if (host->mrq->stop) { +- atmci_writel(host, ATMCI_IER, ATMCI_CMDRDY); + atmci_send_stop_cmd(host, data); + state = STATE_SENDING_STOP; + } else { +@@ -1851,8 +1850,6 @@ static void atmci_tasklet_func(unsigned long priv) + * command to send. + */ + if (host->mrq->stop) { +- atmci_writel(host, ATMCI_IER, +- ATMCI_CMDRDY); + atmci_send_stop_cmd(host, data); + state = STATE_SENDING_STOP; + } else { +diff --git a/drivers/mmc/host/sdhci_am654.c b/drivers/mmc/host/sdhci_am654.c +index 24cd6d3dc6477..bf2592774165b 100644 +--- a/drivers/mmc/host/sdhci_am654.c ++++ b/drivers/mmc/host/sdhci_am654.c +@@ -369,7 +369,7 @@ static void sdhci_am654_write_b(struct sdhci_host *host, u8 val, int reg) + MAX_POWER_ON_TIMEOUT, false, host, val, + reg); + if (ret) +- dev_warn(mmc_dev(host->mmc), "Power on failed\n"); ++ dev_info(mmc_dev(host->mmc), "Power on failed\n"); + } + } + +diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c +index 371b345635e62..a253476a52b01 100644 +--- a/drivers/net/dsa/mv88e6xxx/chip.c ++++ b/drivers/net/dsa/mv88e6xxx/chip.c +@@ -2734,7 +2734,7 @@ static int mv88e6xxx_get_max_mtu(struct dsa_switch *ds, int port) + return 10240 - VLAN_ETH_HLEN - EDSA_HLEN - ETH_FCS_LEN; + else if (chip->info->ops->set_max_frame_size) + return 1632 - VLAN_ETH_HLEN - EDSA_HLEN - ETH_FCS_LEN; +- return 1522 - VLAN_ETH_HLEN - EDSA_HLEN - ETH_FCS_LEN; ++ return ETH_DATA_LEN; + } + + static int mv88e6xxx_change_mtu(struct dsa_switch *ds, int port, int new_mtu) +@@ -2742,6 +2742,17 @@ static int mv88e6xxx_change_mtu(struct dsa_switch *ds, int port, int new_mtu) + struct mv88e6xxx_chip *chip = ds->priv; + int ret = 0; + ++ /* For families where we don't know how to alter the MTU, ++ * just accept any value up to ETH_DATA_LEN ++ */ ++ if (!chip->info->ops->port_set_jumbo_size && ++ !chip->info->ops->set_max_frame_size) { ++ if (new_mtu > ETH_DATA_LEN) ++ return -EINVAL; ++ ++ return 0; ++ } ++ + if (dsa_is_dsa_port(ds, port) || dsa_is_cpu_port(ds, port)) + new_mtu += EDSA_HLEN; + +@@ -2750,9 +2761,6 @@ static int mv88e6xxx_change_mtu(struct dsa_switch *ds, int port, int new_mtu) + ret = chip->info->ops->port_set_jumbo_size(chip, port, new_mtu); + else if (chip->info->ops->set_max_frame_size) + ret = chip->info->ops->set_max_frame_size(chip, new_mtu); +- else +- if (new_mtu > 1522) +- ret = -EINVAL; + mv88e6xxx_reg_unlock(chip); + + return ret; +diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c +index 9e8a20a94862f..76481ff7074ba 100644 +--- a/drivers/net/ethernet/intel/i40e/i40e_main.c ++++ b/drivers/net/ethernet/intel/i40e/i40e_main.c +@@ 
-14851,6 +14851,7 @@ static int i40e_init_recovery_mode(struct i40e_pf *pf, struct i40e_hw *hw) + int err; + int v_idx; + ++ pci_set_drvdata(pf->pdev, pf); + pci_save_state(pf->pdev); + + /* set up periodic task facility */ +diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c +index 59963b901be0f..e0790df700e2c 100644 +--- a/drivers/net/ethernet/intel/ice/ice_xsk.c ++++ b/drivers/net/ethernet/intel/ice/ice_xsk.c +@@ -169,8 +169,6 @@ static int ice_qp_dis(struct ice_vsi *vsi, u16 q_idx) + } + netif_tx_stop_queue(netdev_get_tx_queue(vsi->netdev, q_idx)); + +- ice_qvec_dis_irq(vsi, rx_ring, q_vector); +- + ice_fill_txq_meta(vsi, tx_ring, &txq_meta); + err = ice_vsi_stop_tx_ring(vsi, ICE_NO_RESET, 0, tx_ring, &txq_meta); + if (err) +@@ -185,6 +183,8 @@ static int ice_qp_dis(struct ice_vsi *vsi, u16 q_idx) + if (err) + return err; + } ++ ice_qvec_dis_irq(vsi, rx_ring, q_vector); ++ + err = ice_vsi_ctrl_one_rx_ring(vsi, false, q_idx, true); + if (err) + return err; +diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c +index d2f5855b2ea79..895b6f0a39841 100644 +--- a/drivers/net/ethernet/qlogic/qed/qed_dev.c ++++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c +@@ -4986,6 +4986,11 @@ static int qed_init_wfq_param(struct qed_hwfn *p_hwfn, + + num_vports = p_hwfn->qm_info.num_vports; + ++ if (num_vports < 2) { ++ DP_NOTICE(p_hwfn, "Unexpected num_vports: %d\n", num_vports); ++ return -EINVAL; ++ } ++ + /* Accounting for the vports which are configured for WFQ explicitly */ + for (i = 0; i < num_vports; i++) { + u32 tmp_speed; +diff --git a/drivers/net/ethernet/qlogic/qed/qed_mng_tlv.c b/drivers/net/ethernet/qlogic/qed/qed_mng_tlv.c +index 3e3192a3ad9b7..fdbd5f07a1857 100644 +--- a/drivers/net/ethernet/qlogic/qed/qed_mng_tlv.c ++++ b/drivers/net/ethernet/qlogic/qed/qed_mng_tlv.c +@@ -422,7 +422,7 @@ qed_mfw_get_tlv_time_value(struct qed_mfw_tlv_time *p_time, + if (p_time->hour > 23) + p_time->hour = 0; + if (p_time->min > 59) +- p_time->hour = 0; ++ p_time->min = 0; + if (p_time->msec > 999) + p_time->msec = 0; + if (p_time->usec > 999) +diff --git a/drivers/net/ethernet/sun/ldmvsw.c b/drivers/net/ethernet/sun/ldmvsw.c +index 01ea0d6f88193..934a4b54784b8 100644 +--- a/drivers/net/ethernet/sun/ldmvsw.c ++++ b/drivers/net/ethernet/sun/ldmvsw.c +@@ -290,6 +290,9 @@ static int vsw_port_probe(struct vio_dev *vdev, const struct vio_device_id *id) + + hp = mdesc_grab(); + ++ if (!hp) ++ return -ENODEV; ++ + rmac = mdesc_get_property(hp, vdev->mp, remote_macaddr_prop, &len); + err = -ENODEV; + if (!rmac) { +diff --git a/drivers/net/ethernet/sun/sunvnet.c b/drivers/net/ethernet/sun/sunvnet.c +index 96b883f965f63..b6c03adf1e762 100644 +--- a/drivers/net/ethernet/sun/sunvnet.c ++++ b/drivers/net/ethernet/sun/sunvnet.c +@@ -431,6 +431,9 @@ static int vnet_port_probe(struct vio_dev *vdev, const struct vio_device_id *id) + + hp = mdesc_grab(); + ++ if (!hp) ++ return -ENODEV; ++ + vp = vnet_find_parent(hp, vdev->mp, vdev); + if (IS_ERR(vp)) { + pr_err("Cannot find port parent vnet\n"); +diff --git a/drivers/net/ipvlan/ipvlan_l3s.c b/drivers/net/ipvlan/ipvlan_l3s.c +index 943d26cbf39f5..71712ea25403d 100644 +--- a/drivers/net/ipvlan/ipvlan_l3s.c ++++ b/drivers/net/ipvlan/ipvlan_l3s.c +@@ -101,6 +101,7 @@ static unsigned int ipvlan_nf_input(void *priv, struct sk_buff *skb, + goto out; + + skb->dev = addr->master->dev; ++ skb->skb_iif = skb->dev->ifindex; + len = skb->len + ETH_HLEN; + ipvlan_count_rx(addr->master, len, true, 
false); + out: +diff --git a/drivers/net/phy/smsc.c b/drivers/net/phy/smsc.c +index caf7291ffaf83..b67de3f9ef186 100644 +--- a/drivers/net/phy/smsc.c ++++ b/drivers/net/phy/smsc.c +@@ -181,8 +181,11 @@ static int lan95xx_config_aneg_ext(struct phy_device *phydev) + static int lan87xx_read_status(struct phy_device *phydev) + { + struct smsc_phy_priv *priv = phydev->priv; ++ int err; + +- int err = genphy_read_status(phydev); ++ err = genphy_read_status(phydev); ++ if (err) ++ return err; + + if (!phydev->link && priv->energy_enable) { + /* Disable EDPD to wake up PHY */ +diff --git a/drivers/net/usb/smsc75xx.c b/drivers/net/usb/smsc75xx.c +index 378a12ae2d957..fb1389bd09392 100644 +--- a/drivers/net/usb/smsc75xx.c ++++ b/drivers/net/usb/smsc75xx.c +@@ -2199,6 +2199,13 @@ static int smsc75xx_rx_fixup(struct usbnet *dev, struct sk_buff *skb) + size = (rx_cmd_a & RX_CMD_A_LEN) - RXW_PADDING; + align_count = (4 - ((size + RXW_PADDING) % 4)) % 4; + ++ if (unlikely(size > skb->len)) { ++ netif_dbg(dev, rx_err, dev->net, ++ "size err rx_cmd_a=0x%08x\n", ++ rx_cmd_a); ++ return 0; ++ } ++ + if (unlikely(rx_cmd_a & RX_CMD_A_RED)) { + netif_dbg(dev, rx_err, dev->net, + "Error rx_cmd_a=0x%08x\n", rx_cmd_a); +diff --git a/drivers/nfc/pn533/usb.c b/drivers/nfc/pn533/usb.c +index 57b07446bb768..68eb1253f888f 100644 +--- a/drivers/nfc/pn533/usb.c ++++ b/drivers/nfc/pn533/usb.c +@@ -175,6 +175,7 @@ static int pn533_usb_send_frame(struct pn533 *dev, + print_hex_dump_debug("PN533 TX: ", DUMP_PREFIX_NONE, 16, 1, + out->data, out->len, false); + ++ arg.phy = phy; + init_completion(&arg.done); + cntx = phy->out_urb->context; + phy->out_urb->context = &arg; +diff --git a/drivers/nfc/st-nci/ndlc.c b/drivers/nfc/st-nci/ndlc.c +index 5d74c674368a5..8ccf5a86ad1bb 100644 +--- a/drivers/nfc/st-nci/ndlc.c ++++ b/drivers/nfc/st-nci/ndlc.c +@@ -286,13 +286,15 @@ EXPORT_SYMBOL(ndlc_probe); + + void ndlc_remove(struct llt_ndlc *ndlc) + { +- st_nci_remove(ndlc->ndev); +- + /* cancel timers */ + del_timer_sync(&ndlc->t1_timer); + del_timer_sync(&ndlc->t2_timer); + ndlc->t2_active = false; + ndlc->t1_active = false; ++ /* cancel work */ ++ cancel_work_sync(&ndlc->sm_work); ++ ++ st_nci_remove(ndlc->ndev); + + skb_queue_purge(&ndlc->rcv_q); + skb_queue_purge(&ndlc->send_q); +diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c +index e162f1dfbafe9..a4b6aa932a8fe 100644 +--- a/drivers/nvme/host/core.c ++++ b/drivers/nvme/host/core.c +@@ -723,16 +723,26 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req, + range = page_address(ns->ctrl->discard_page); + } + +- __rq_for_each_bio(bio, req) { +- u64 slba = nvme_sect_to_lba(ns, bio->bi_iter.bi_sector); +- u32 nlb = bio->bi_iter.bi_size >> ns->lba_shift; +- +- if (n < segments) { +- range[n].cattr = cpu_to_le32(0); +- range[n].nlb = cpu_to_le32(nlb); +- range[n].slba = cpu_to_le64(slba); ++ if (queue_max_discard_segments(req->q) == 1) { ++ u64 slba = nvme_sect_to_lba(ns, blk_rq_pos(req)); ++ u32 nlb = blk_rq_sectors(req) >> (ns->lba_shift - 9); ++ ++ range[0].cattr = cpu_to_le32(0); ++ range[0].nlb = cpu_to_le32(nlb); ++ range[0].slba = cpu_to_le64(slba); ++ n = 1; ++ } else { ++ __rq_for_each_bio(bio, req) { ++ u64 slba = nvme_sect_to_lba(ns, bio->bi_iter.bi_sector); ++ u32 nlb = bio->bi_iter.bi_size >> ns->lba_shift; ++ ++ if (n < segments) { ++ range[n].cattr = cpu_to_le32(0); ++ range[n].nlb = cpu_to_le32(nlb); ++ range[n].slba = cpu_to_le64(slba); ++ } ++ n++; + } +- n++; + } + + if (WARN_ON_ONCE(n != segments)) { +diff --git 
a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c +index bc88ff2912f56..a82a0796a6148 100644 +--- a/drivers/nvme/target/core.c ++++ b/drivers/nvme/target/core.c +@@ -749,8 +749,10 @@ static void __nvmet_req_complete(struct nvmet_req *req, u16 status) + + void nvmet_req_complete(struct nvmet_req *req, u16 status) + { ++ struct nvmet_sq *sq = req->sq; ++ + __nvmet_req_complete(req, status); +- percpu_ref_put(&req->sq->ref); ++ percpu_ref_put(&sq->ref); + } + EXPORT_SYMBOL_GPL(nvmet_req_complete); + +diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c +index 8b587fc97f7bc..c22cc20db1a74 100644 +--- a/drivers/pci/pci-driver.c ++++ b/drivers/pci/pci-driver.c +@@ -911,7 +911,7 @@ static int pci_pm_resume_noirq(struct device *dev) + pcie_pme_root_status_cleanup(pci_dev); + + if (!skip_bus_pm && prev_state == PCI_D3cold) +- pci_bridge_wait_for_secondary_bus(pci_dev); ++ pci_bridge_wait_for_secondary_bus(pci_dev, "resume", PCI_RESET_WAIT); + + if (pci_has_legacy_pm_support(pci_dev)) + return 0; +@@ -1298,7 +1298,7 @@ static int pci_pm_runtime_resume(struct device *dev) + pci_pm_default_resume(pci_dev); + + if (prev_state == PCI_D3cold) +- pci_bridge_wait_for_secondary_bus(pci_dev); ++ pci_bridge_wait_for_secondary_bus(pci_dev, "resume", PCI_RESET_WAIT); + + if (pm && pm->runtime_resume) + error = pm->runtime_resume(dev); +diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c +index 744a2e05635b9..d37013d007b6e 100644 +--- a/drivers/pci/pci.c ++++ b/drivers/pci/pci.c +@@ -157,9 +157,6 @@ static int __init pcie_port_pm_setup(char *str) + } + __setup("pcie_port_pm=", pcie_port_pm_setup); + +-/* Time to wait after a reset for device to become responsive */ +-#define PCIE_RESET_READY_POLL_MS 60000 +- + /** + * pci_bus_max_busnr - returns maximum PCI bus number of given bus' children + * @bus: pointer to PCI bus structure to search +@@ -1221,7 +1218,7 @@ static int pci_dev_wait(struct pci_dev *dev, char *reset_type, int timeout) + return -ENOTTY; + } + +- if (delay > 1000) ++ if (delay > PCI_RESET_WAIT) + pci_info(dev, "not ready %dms after %s; waiting\n", + delay - 1, reset_type); + +@@ -1230,7 +1227,7 @@ static int pci_dev_wait(struct pci_dev *dev, char *reset_type, int timeout) + pci_read_config_dword(dev, PCI_COMMAND, &id); + } + +- if (delay > 1000) ++ if (delay > PCI_RESET_WAIT) + pci_info(dev, "ready %dms after %s\n", delay - 1, + reset_type); + +@@ -4792,24 +4789,31 @@ static int pci_bus_max_d3cold_delay(const struct pci_bus *bus) + /** + * pci_bridge_wait_for_secondary_bus - Wait for secondary bus to be accessible + * @dev: PCI bridge ++ * @reset_type: reset type in human-readable form ++ * @timeout: maximum time to wait for devices on secondary bus (milliseconds) + * + * Handle necessary delays before access to the devices on the secondary +- * side of the bridge are permitted after D3cold to D0 transition. ++ * side of the bridge are permitted after D3cold to D0 transition ++ * or Conventional Reset. + * + * For PCIe this means the delays in PCIe 5.0 section 6.6.1. For + * conventional PCI it means Tpvrh + Trhfa specified in PCI 3.0 section + * 4.3.2. ++ * ++ * Return 0 on success or -ENOTTY if the first device on the secondary bus ++ * failed to become accessible. 
+ */ +-void pci_bridge_wait_for_secondary_bus(struct pci_dev *dev) ++int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type, ++ int timeout) + { + struct pci_dev *child; + int delay; + + if (pci_dev_is_disconnected(dev)) +- return; ++ return 0; + + if (!pci_is_bridge(dev)) +- return; ++ return 0; + + down_read(&pci_bus_sem); + +@@ -4821,14 +4825,14 @@ void pci_bridge_wait_for_secondary_bus(struct pci_dev *dev) + */ + if (!dev->subordinate || list_empty(&dev->subordinate->devices)) { + up_read(&pci_bus_sem); +- return; ++ return 0; + } + + /* Take d3cold_delay requirements into account */ + delay = pci_bus_max_d3cold_delay(dev->subordinate); + if (!delay) { + up_read(&pci_bus_sem); +- return; ++ return 0; + } + + child = list_first_entry(&dev->subordinate->devices, struct pci_dev, +@@ -4837,14 +4841,12 @@ void pci_bridge_wait_for_secondary_bus(struct pci_dev *dev) + + /* + * Conventional PCI and PCI-X we need to wait Tpvrh + Trhfa before +- * accessing the device after reset (that is 1000 ms + 100 ms). In +- * practice this should not be needed because we don't do power +- * management for them (see pci_bridge_d3_possible()). ++ * accessing the device after reset (that is 1000 ms + 100 ms). + */ + if (!pci_is_pcie(dev)) { + pci_dbg(dev, "waiting %d ms for secondary bus\n", 1000 + delay); + msleep(1000 + delay); +- return; ++ return 0; + } + + /* +@@ -4861,11 +4863,11 @@ void pci_bridge_wait_for_secondary_bus(struct pci_dev *dev) + * configuration requests if we only wait for 100 ms (see + * https://bugzilla.kernel.org/show_bug.cgi?id=203885). + * +- * Therefore we wait for 100 ms and check for the device presence. +- * If it is still not present give it an additional 100 ms. ++ * Therefore we wait for 100 ms and check for the device presence ++ * until the timeout expires. + */ + if (!pcie_downstream_port(dev)) +- return; ++ return 0; + + if (pcie_get_speed_cap(dev) <= PCIE_SPEED_5_0GT) { + pci_dbg(dev, "waiting %d ms for downstream link\n", delay); +@@ -4876,14 +4878,11 @@ void pci_bridge_wait_for_secondary_bus(struct pci_dev *dev) + if (!pcie_wait_for_link_delay(dev, true, delay)) { + /* Did not train, no need to wait any further */ + pci_info(dev, "Data Link Layer Link Active not set in 1000 msec\n"); +- return; ++ return -ENOTTY; + } + } + +- if (!pci_device_is_present(child)) { +- pci_dbg(child, "waiting additional %d ms to become accessible\n", delay); +- msleep(delay); +- } ++ return pci_dev_wait(child, reset_type, timeout - delay); + } + + void pci_reset_secondary_bus(struct pci_dev *dev) +@@ -4902,15 +4901,6 @@ void pci_reset_secondary_bus(struct pci_dev *dev) + + ctrl &= ~PCI_BRIDGE_CTL_BUS_RESET; + pci_write_config_word(dev, PCI_BRIDGE_CONTROL, ctrl); +- +- /* +- * Trhfa for conventional PCI is 2^25 clock cycles. +- * Assuming a minimum 33MHz clock this results in a 1s +- * delay before we can consider subordinate devices to +- * be re-initialized. PCIe has some ways to shorten this, +- * but we don't make use of them yet. 
+- */ +- ssleep(1); + } + + void __weak pcibios_reset_secondary_bus(struct pci_dev *dev) +@@ -4929,7 +4919,8 @@ int pci_bridge_secondary_bus_reset(struct pci_dev *dev) + { + pcibios_reset_secondary_bus(dev); + +- return pci_dev_wait(dev, "bus reset", PCIE_RESET_READY_POLL_MS); ++ return pci_bridge_wait_for_secondary_bus(dev, "bus reset", ++ PCIE_RESET_READY_POLL_MS); + } + EXPORT_SYMBOL_GPL(pci_bridge_secondary_bus_reset); + +diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h +index 9197d7362731e..72436000ff252 100644 +--- a/drivers/pci/pci.h ++++ b/drivers/pci/pci.h +@@ -47,6 +47,19 @@ int pci_bus_error_reset(struct pci_dev *dev); + #define PCI_PM_D3HOT_WAIT 10 /* msec */ + #define PCI_PM_D3COLD_WAIT 100 /* msec */ + ++/* ++ * Following exit from Conventional Reset, devices must be ready within 1 sec ++ * (PCIe r6.0 sec 6.6.1). A D3cold to D0 transition implies a Conventional ++ * Reset (PCIe r6.0 sec 5.8). ++ */ ++#define PCI_RESET_WAIT 1000 /* msec */ ++/* ++ * Devices may extend the 1 sec period through Request Retry Status completions ++ * (PCIe r6.0 sec 2.3.1). The spec does not provide an upper limit, but 60 sec ++ * ought to be enough for any device to become responsive. ++ */ ++#define PCIE_RESET_READY_POLL_MS 60000 /* msec */ ++ + /** + * struct pci_platform_pm_ops - Firmware PM callbacks + * +@@ -108,7 +121,8 @@ void pci_allocate_cap_save_buffers(struct pci_dev *dev); + void pci_free_cap_save_buffers(struct pci_dev *dev); + bool pci_bridge_d3_possible(struct pci_dev *dev); + void pci_bridge_d3_update(struct pci_dev *dev); +-void pci_bridge_wait_for_secondary_bus(struct pci_dev *dev); ++int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type, ++ int timeout); + + static inline void pci_wakeup_event(struct pci_dev *dev) + { +diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c +index c556e7beafe38..f21d64ae4ffcc 100644 +--- a/drivers/pci/pcie/dpc.c ++++ b/drivers/pci/pcie/dpc.c +@@ -170,8 +170,8 @@ pci_ers_result_t dpc_reset_link(struct pci_dev *pdev) + pci_write_config_word(pdev, cap + PCI_EXP_DPC_STATUS, + PCI_EXP_DPC_STATUS_TRIGGER); + +- if (!pcie_wait_for_link(pdev, true)) { +- pci_info(pdev, "Data Link Layer Link Active not set in 1000 msec\n"); ++ if (pci_bridge_wait_for_secondary_bus(pdev, "DPC", ++ PCIE_RESET_READY_POLL_MS)) { + clear_bit(PCI_DPC_RECOVERED, &pdev->priv_flags); + ret = PCI_ERS_RESULT_DISCONNECT; + } else { +diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c +index fae0323242103..18321cf9db5d6 100644 +--- a/drivers/scsi/hosts.c ++++ b/drivers/scsi/hosts.c +@@ -322,10 +322,7 @@ static void scsi_host_dev_release(struct device *dev) + struct Scsi_Host *shost = dev_to_shost(dev); + struct device *parent = dev->parent; + +- /* In case scsi_remove_host() has not been called. */ +- scsi_proc_hostdir_rm(shost->hostt); +- +- /* Wait for functions invoked through call_rcu(&shost->rcu, ...) */ ++ /* Wait for functions invoked through call_rcu(&scmd->rcu, ...) 
*/ + rcu_barrier(); + + if (shost->tmf_work_q) +diff --git a/drivers/scsi/mpt3sas/mpt3sas_transport.c b/drivers/scsi/mpt3sas/mpt3sas_transport.c +index b58f4d9c296a3..326265fd7f91a 100644 +--- a/drivers/scsi/mpt3sas/mpt3sas_transport.c ++++ b/drivers/scsi/mpt3sas/mpt3sas_transport.c +@@ -670,7 +670,7 @@ mpt3sas_transport_port_add(struct MPT3SAS_ADAPTER *ioc, u16 handle, + goto out_fail; + } + port = sas_port_alloc_num(sas_node->parent_dev); +- if ((sas_port_add(port))) { ++ if (!port || (sas_port_add(port))) { + ioc_err(ioc, "failure at %s:%d/%s()!\n", + __FILE__, __LINE__, __func__); + goto out_fail; +@@ -695,6 +695,12 @@ mpt3sas_transport_port_add(struct MPT3SAS_ADAPTER *ioc, u16 handle, + rphy = sas_expander_alloc(port, + mpt3sas_port->remote_identify.device_type); + ++ if (!rphy) { ++ ioc_err(ioc, "failure at %s:%d/%s()!\n", ++ __FILE__, __LINE__, __func__); ++ goto out_delete_port; ++ } ++ + rphy->identify = mpt3sas_port->remote_identify; + + if (mpt3sas_port->remote_identify.device_type == SAS_END_DEVICE) { +@@ -714,6 +720,7 @@ mpt3sas_transport_port_add(struct MPT3SAS_ADAPTER *ioc, u16 handle, + __FILE__, __LINE__, __func__); + sas_rphy_free(rphy); + rphy = NULL; ++ goto out_delete_port; + } + + if (mpt3sas_port->remote_identify.device_type == SAS_END_DEVICE) { +@@ -740,7 +747,10 @@ mpt3sas_transport_port_add(struct MPT3SAS_ADAPTER *ioc, u16 handle, + rphy_to_expander_device(rphy)); + return mpt3sas_port; + +- out_fail: ++out_delete_port: ++ sas_port_delete(port); ++ ++out_fail: + list_for_each_entry_safe(mpt3sas_phy, next, &mpt3sas_port->phy_list, + port_siblings) + list_del(&mpt3sas_phy->port_siblings); +diff --git a/drivers/tty/serial/8250/8250_em.c b/drivers/tty/serial/8250/8250_em.c +index f8e99995eee91..d94c3811a8f7a 100644 +--- a/drivers/tty/serial/8250/8250_em.c ++++ b/drivers/tty/serial/8250/8250_em.c +@@ -106,8 +106,8 @@ static int serial8250_em_probe(struct platform_device *pdev) + memset(&up, 0, sizeof(up)); + up.port.mapbase = regs->start; + up.port.irq = irq; +- up.port.type = PORT_UNKNOWN; +- up.port.flags = UPF_BOOT_AUTOCONF | UPF_FIXED_PORT | UPF_IOREMAP; ++ up.port.type = PORT_16750; ++ up.port.flags = UPF_FIXED_PORT | UPF_IOREMAP | UPF_FIXED_TYPE; + up.port.dev = &pdev->dev; + up.port.private_data = priv; + +diff --git a/drivers/tty/serial/fsl_lpuart.c b/drivers/tty/serial/fsl_lpuart.c +index 9cb0e8673f826..32cce52800a73 100644 +--- a/drivers/tty/serial/fsl_lpuart.c ++++ b/drivers/tty/serial/fsl_lpuart.c +@@ -2159,9 +2159,15 @@ lpuart32_set_termios(struct uart_port *port, struct ktermios *termios, + /* update the per-port timeout */ + uart_update_timeout(port, termios->c_cflag, baud); + +- /* wait transmit engin complete */ +- lpuart32_write(&sport->port, 0, UARTMODIR); +- lpuart32_wait_bit_set(&sport->port, UARTSTAT, UARTSTAT_TC); ++ /* ++ * LPUART Transmission Complete Flag may never be set while queuing a break ++ * character, so skip waiting for transmission complete when UARTCTRL_SBK is ++ * asserted. 
++ */ ++ if (!(old_ctrl & UARTCTRL_SBK)) { ++ lpuart32_write(&sport->port, 0, UARTMODIR); ++ lpuart32_wait_bit_set(&sport->port, UARTSTAT, UARTSTAT_TC); ++ } + + /* disable transmit and receive */ + lpuart32_write(&sport->port, old_ctrl & ~(UARTCTRL_TE | UARTCTRL_RE), +diff --git a/drivers/video/fbdev/stifb.c b/drivers/video/fbdev/stifb.c +index 3feb6e40d56d8..ef8a4c5fc6875 100644 +--- a/drivers/video/fbdev/stifb.c ++++ b/drivers/video/fbdev/stifb.c +@@ -921,6 +921,28 @@ SETUP_HCRX(struct stifb_info *fb) + + /* ------------------- driver specific functions --------------------------- */ + ++static int ++stifb_check_var(struct fb_var_screeninfo *var, struct fb_info *info) ++{ ++ struct stifb_info *fb = container_of(info, struct stifb_info, info); ++ ++ if (var->xres != fb->info.var.xres || ++ var->yres != fb->info.var.yres || ++ var->bits_per_pixel != fb->info.var.bits_per_pixel) ++ return -EINVAL; ++ ++ var->xres_virtual = var->xres; ++ var->yres_virtual = var->yres; ++ var->xoffset = 0; ++ var->yoffset = 0; ++ var->grayscale = fb->info.var.grayscale; ++ var->red.length = fb->info.var.red.length; ++ var->green.length = fb->info.var.green.length; ++ var->blue.length = fb->info.var.blue.length; ++ ++ return 0; ++} ++ + static int + stifb_setcolreg(u_int regno, u_int red, u_int green, + u_int blue, u_int transp, struct fb_info *info) +@@ -1145,6 +1167,7 @@ stifb_init_display(struct stifb_info *fb) + + static const struct fb_ops stifb_ops = { + .owner = THIS_MODULE, ++ .fb_check_var = stifb_check_var, + .fb_setcolreg = stifb_setcolreg, + .fb_blank = stifb_blank, + .fb_fillrect = stifb_fillrect, +@@ -1164,6 +1187,7 @@ static int __init stifb_init_fb(struct sti_struct *sti, int bpp_pref) + struct stifb_info *fb; + struct fb_info *info; + unsigned long sti_rom_address; ++ char modestr[32]; + char *dev_name; + int bpp, xres, yres; + +@@ -1342,6 +1366,9 @@ static int __init stifb_init_fb(struct sti_struct *sti, int bpp_pref) + info->flags = FBINFO_HWACCEL_COPYAREA | FBINFO_HWACCEL_FILLRECT; + info->pseudo_palette = &fb->pseudo_palette; + ++ scnprintf(modestr, sizeof(modestr), "%dx%d-%d", xres, yres, bpp); ++ fb_find_mode(&info->var, info, modestr, NULL, 0, NULL, bpp); ++ + /* This has to be done !!! */ + if (fb_alloc_cmap(&info->cmap, NR_PALETTE, 0)) + goto out_err1; +diff --git a/fs/attr.c b/fs/attr.c +index 848ffe6e3c24b..326a0db3296d7 100644 +--- a/fs/attr.c ++++ b/fs/attr.c +@@ -18,6 +18,65 @@ + #include <linux/evm.h> + #include <linux/ima.h> + ++#include "internal.h" ++ ++/** ++ * setattr_should_drop_sgid - determine whether the setgid bit needs to be ++ * removed ++ * @inode: inode to check ++ * ++ * This function determines whether the setgid bit needs to be removed. ++ * We retain backwards compatibility and require setgid bit to be removed ++ * unconditionally if S_IXGRP is set. Otherwise we have the exact same ++ * requirements as setattr_prepare() and setattr_copy(). ++ * ++ * Return: ATTR_KILL_SGID if setgid bit needs to be removed, 0 otherwise. ++ */ ++int setattr_should_drop_sgid(const struct inode *inode) ++{ ++ umode_t mode = inode->i_mode; ++ ++ if (!(mode & S_ISGID)) ++ return 0; ++ if (mode & S_IXGRP) ++ return ATTR_KILL_SGID; ++ if (!in_group_or_capable(inode, inode->i_gid)) ++ return ATTR_KILL_SGID; ++ return 0; ++} ++ ++/** ++ * setattr_should_drop_suidgid - determine whether the set{g,u}id bit needs to ++ * be dropped ++ * @inode: inode to check ++ * ++ * This function determines whether the set{g,u}id bits need to be removed. 
++ * If the setuid bit needs to be removed ATTR_KILL_SUID is returned. If the ++ * setgid bit needs to be removed ATTR_KILL_SGID is returned. If both ++ * set{g,u}id bits need to be removed the corresponding mask of both flags is ++ * returned. ++ * ++ * Return: A mask of ATTR_KILL_S{G,U}ID indicating which - if any - setid bits ++ * to remove, 0 otherwise. ++ */ ++int setattr_should_drop_suidgid(struct inode *inode) ++{ ++ umode_t mode = inode->i_mode; ++ int kill = 0; ++ ++ /* suid always must be killed */ ++ if (unlikely(mode & S_ISUID)) ++ kill = ATTR_KILL_SUID; ++ ++ kill |= setattr_should_drop_sgid(inode); ++ ++ if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode))) ++ return kill; ++ ++ return 0; ++} ++EXPORT_SYMBOL(setattr_should_drop_suidgid); ++ + static bool chown_ok(const struct inode *inode, kuid_t uid) + { + if (uid_eq(current_fsuid(), inode->i_uid) && +@@ -90,9 +149,8 @@ int setattr_prepare(struct dentry *dentry, struct iattr *attr) + if (!inode_owner_or_capable(inode)) + return -EPERM; + /* Also check the setgid bit! */ +- if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid : +- inode->i_gid) && +- !capable_wrt_inode_uidgid(inode, CAP_FSETID)) ++ if (!in_group_or_capable(inode, (ia_valid & ATTR_GID) ? ++ attr->ia_gid : inode->i_gid)) + attr->ia_mode &= ~S_ISGID; + } + +@@ -193,9 +251,7 @@ void setattr_copy(struct inode *inode, const struct iattr *attr) + inode->i_ctime = attr->ia_ctime; + if (ia_valid & ATTR_MODE) { + umode_t mode = attr->ia_mode; +- +- if (!in_group_p(inode->i_gid) && +- !capable_wrt_inode_uidgid(inode, CAP_FSETID)) ++ if (!in_group_or_capable(inode, inode->i_gid)) + mode &= ~S_ISGID; + inode->i_mode = mode; + } +@@ -297,7 +353,7 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de + } + } + if (ia_valid & ATTR_KILL_SGID) { +- if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) { ++ if (mode & S_ISGID) { + if (!(ia_valid & ATTR_MODE)) { + ia_valid = attr->ia_valid |= ATTR_MODE; + attr->ia_mode = inode->i_mode; +diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c +index 97cd4df040608..e11818801148a 100644 +--- a/fs/cifs/smb2inode.c ++++ b/fs/cifs/smb2inode.c +@@ -236,15 +236,32 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, + size[0] = 8; /* sizeof __le64 */ + data[0] = ptr; + +- rc = SMB2_set_info_init(tcon, server, +- &rqst[num_rqst], COMPOUND_FID, +- COMPOUND_FID, current->tgid, +- FILE_END_OF_FILE_INFORMATION, +- SMB2_O_INFO_FILE, 0, data, size); ++ if (cfile) { ++ rc = SMB2_set_info_init(tcon, server, ++ &rqst[num_rqst], ++ cfile->fid.persistent_fid, ++ cfile->fid.volatile_fid, ++ current->tgid, ++ FILE_END_OF_FILE_INFORMATION, ++ SMB2_O_INFO_FILE, 0, ++ data, size); ++ } else { ++ rc = SMB2_set_info_init(tcon, server, ++ &rqst[num_rqst], ++ COMPOUND_FID, ++ COMPOUND_FID, ++ current->tgid, ++ FILE_END_OF_FILE_INFORMATION, ++ SMB2_O_INFO_FILE, 0, ++ data, size); ++ if (!rc) { ++ smb2_set_next_command(tcon, &rqst[num_rqst]); ++ smb2_set_related(&rqst[num_rqst]); ++ } ++ } + if (rc) + goto finished; +- smb2_set_next_command(tcon, &rqst[num_rqst]); +- smb2_set_related(&rqst[num_rqst++]); ++ num_rqst++; + trace_smb3_set_eof_enter(xid, ses->Suid, tcon->tid, full_path); + break; + case SMB2_OP_SET_INFO: +diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c +index b137006f0fd25..4409f56fc37e6 100644 +--- a/fs/cifs/transport.c ++++ b/fs/cifs/transport.c +@@ -312,7 +312,7 @@ static int + __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, + struct smb_rqst *rqst) + { +- int rc = 0; ++ 
int rc; + struct kvec *iov; + int n_vec; + unsigned int send_length = 0; +@@ -323,6 +323,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, + struct msghdr smb_msg = {}; + __be32 rfc1002_marker; + ++ cifs_in_send_inc(server); + if (cifs_rdma_enabled(server)) { + /* return -EAGAIN when connecting or reconnecting */ + rc = -EAGAIN; +@@ -331,14 +332,17 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, + goto smbd_done; + } + ++ rc = -EAGAIN; + if (ssocket == NULL) +- return -EAGAIN; ++ goto out; + ++ rc = -ERESTARTSYS; + if (fatal_signal_pending(current)) { + cifs_dbg(FYI, "signal pending before send request\n"); +- return -ERESTARTSYS; ++ goto out; + } + ++ rc = 0; + /* cork the socket */ + tcp_sock_set_cork(ssocket->sk, true); + +@@ -449,7 +453,8 @@ smbd_done: + rc); + else if (rc > 0) + rc = 0; +- ++out: ++ cifs_in_send_dec(server); + return rc; + } + +@@ -826,9 +831,7 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst, + * I/O response may come back and free the mid entry on another thread. + */ + cifs_save_when_sent(mid); +- cifs_in_send_inc(server); + rc = smb_send_rqst(server, 1, rqst, flags); +- cifs_in_send_dec(server); + + if (rc < 0) { + revert_current_mid(server, mid->credits); +@@ -1117,9 +1120,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, + else + midQ[i]->callback = cifs_compound_last_callback; + } +- cifs_in_send_inc(server); + rc = smb_send_rqst(server, num_rqst, rqst, flags); +- cifs_in_send_dec(server); + + for (i = 0; i < num_rqst; i++) + cifs_save_when_sent(midQ[i]); +@@ -1356,9 +1357,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses, + + midQ->mid_state = MID_REQUEST_SUBMITTED; + +- cifs_in_send_inc(server); + rc = smb_send(server, in_buf, len); +- cifs_in_send_dec(server); + cifs_save_when_sent(midQ); + + if (rc < 0) +@@ -1495,9 +1494,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, + } + + midQ->mid_state = MID_REQUEST_SUBMITTED; +- cifs_in_send_inc(server); + rc = smb_send(server, in_buf, len); +- cifs_in_send_dec(server); + cifs_save_when_sent(midQ); + + if (rc < 0) +diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c +index 1a654a1f3f46b..6ba185b46ba39 100644 +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -4721,13 +4721,6 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, + goto bad_inode; + raw_inode = ext4_raw_inode(&iloc); + +- if ((ino == EXT4_ROOT_INO) && (raw_inode->i_links_count == 0)) { +- ext4_error_inode(inode, function, line, 0, +- "iget: root inode unallocated"); +- ret = -EFSCORRUPTED; +- goto bad_inode; +- } +- + if ((flags & EXT4_IGET_HANDLE) && + (raw_inode->i_links_count == 0) && (raw_inode->i_mode == 0)) { + ret = -ESTALE; +@@ -4800,11 +4793,16 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, + * NeilBrown 1999oct15 + */ + if (inode->i_nlink == 0) { +- if ((inode->i_mode == 0 || ++ if ((inode->i_mode == 0 || flags & EXT4_IGET_SPECIAL || + !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) && + ino != EXT4_BOOT_LOADER_INO) { +- /* this inode is deleted */ +- ret = -ESTALE; ++ /* this inode is deleted or unallocated */ ++ if (flags & EXT4_IGET_SPECIAL) { ++ ext4_error_inode(inode, function, line, 0, ++ "iget: special inode unallocated"); ++ ret = -EFSCORRUPTED; ++ } else ++ ret = -ESTALE; + goto bad_inode; + } + /* The only unlinked inodes we let through here have +diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c +index 1f47aeca71422..45f719c1e0023 100644 +--- a/fs/ext4/namei.c ++++ 
b/fs/ext4/namei.c +@@ -3934,10 +3934,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, + goto end_rename; + } + retval = ext4_rename_dir_prepare(handle, &old); +- if (retval) { +- inode_unlock(old.inode); ++ if (retval) + goto end_rename; +- } + } + /* + * If we're renaming a file within an inline_data dir and adding or +diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c +index 60e122761352c..f3da1f2d4cb93 100644 +--- a/fs/ext4/xattr.c ++++ b/fs/ext4/xattr.c +@@ -386,6 +386,17 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino, + struct inode *inode; + int err; + ++ /* ++ * We have to check for this corruption early as otherwise ++ * iget_locked() could wait indefinitely for the state of our ++ * parent inode. ++ */ ++ if (parent->i_ino == ea_ino) { ++ ext4_error(parent->i_sb, ++ "Parent and EA inode have the same ino %lu", ea_ino); ++ return -EFSCORRUPTED; ++ } ++ + inode = ext4_iget(parent->i_sb, ea_ino, EXT4_IGET_NORMAL); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); +diff --git a/fs/inode.c b/fs/inode.c +index 9f49e0bdc2f77..7ec90788d8be9 100644 +--- a/fs/inode.c ++++ b/fs/inode.c +@@ -1854,35 +1854,6 @@ skip_update: + } + EXPORT_SYMBOL(touch_atime); + +-/* +- * The logic we want is +- * +- * if suid or (sgid and xgrp) +- * remove privs +- */ +-int should_remove_suid(struct dentry *dentry) +-{ +- umode_t mode = d_inode(dentry)->i_mode; +- int kill = 0; +- +- /* suid always must be killed */ +- if (unlikely(mode & S_ISUID)) +- kill = ATTR_KILL_SUID; +- +- /* +- * sgid without any exec bits is just a mandatory locking mark; leave +- * it alone. If some exec bits are set, it's a real sgid; kill it. +- */ +- if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) +- kill |= ATTR_KILL_SGID; +- +- if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode))) +- return kill; +- +- return 0; +-} +-EXPORT_SYMBOL(should_remove_suid); +- + /* + * Return mask of changes for notify_change() that need to be done as a + * response to write or truncate. Return 0 if nothing has to be changed. +@@ -1897,7 +1868,7 @@ int dentry_needs_remove_privs(struct dentry *dentry) + if (IS_NOSEC(inode)) + return 0; + +- mask = should_remove_suid(dentry); ++ mask = setattr_should_drop_suidgid(inode); + ret = security_inode_need_killpriv(dentry); + if (ret < 0) + return ret; +@@ -2147,10 +2118,6 @@ void inode_init_owner(struct inode *inode, const struct inode *dir, + /* Directories are special, and always inherit S_ISGID */ + if (S_ISDIR(mode)) + mode |= S_ISGID; +- else if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP) && +- !in_group_p(inode->i_gid) && +- !capable_wrt_inode_uidgid(dir, CAP_FSETID)) +- mode &= ~S_ISGID; + } else + inode->i_gid = current_fsgid(); + inode->i_mode = mode; +@@ -2382,3 +2349,48 @@ int vfs_ioc_fssetxattr_check(struct inode *inode, const struct fsxattr *old_fa, + return 0; + } + EXPORT_SYMBOL(vfs_ioc_fssetxattr_check); ++ ++/** ++ * in_group_or_capable - check whether caller is CAP_FSETID privileged ++ * @inode: inode to check ++ * @gid: the new/current gid of @inode ++ * ++ * Check wether @gid is in the caller's group list or if the caller is ++ * privileged with CAP_FSETID over @inode. This can be used to determine ++ * whether the setgid bit can be kept or must be dropped. ++ * ++ * Return: true if the caller is sufficiently privileged, false if not. 
++ */ ++bool in_group_or_capable(const struct inode *inode, kgid_t gid) ++{ ++ if (in_group_p(gid)) ++ return true; ++ if (capable_wrt_inode_uidgid(inode, CAP_FSETID)) ++ return true; ++ return false; ++} ++ ++/** ++ * mode_strip_sgid - handle the sgid bit for non-directories ++ * @dir: parent directory inode ++ * @mode: mode of the file to be created in @dir ++ * ++ * If the @mode of the new file has both the S_ISGID and S_IXGRP bit ++ * raised and @dir has the S_ISGID bit raised ensure that the caller is ++ * either in the group of the parent directory or they have CAP_FSETID ++ * in their user namespace and are privileged over the parent directory. ++ * In all other cases, strip the S_ISGID bit from @mode. ++ * ++ * Return: the new mode to use for the file ++ */ ++umode_t mode_strip_sgid(const struct inode *dir, umode_t mode) ++{ ++ if ((mode & (S_ISGID | S_IXGRP)) != (S_ISGID | S_IXGRP)) ++ return mode; ++ if (S_ISDIR(mode) || !dir || !(dir->i_mode & S_ISGID)) ++ return mode; ++ if (in_group_or_capable(dir, dir->i_gid)) ++ return mode; ++ return mode & ~S_ISGID; ++} ++EXPORT_SYMBOL(mode_strip_sgid); +diff --git a/fs/internal.h b/fs/internal.h +index 06d313b9beecb..d5d9fcdae10c4 100644 +--- a/fs/internal.h ++++ b/fs/internal.h +@@ -149,6 +149,7 @@ extern int vfs_open(const struct path *, struct file *); + extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc); + extern void inode_add_lru(struct inode *inode); + extern int dentry_needs_remove_privs(struct dentry *dentry); ++bool in_group_or_capable(const struct inode *inode, kgid_t gid); + + /* + * fs-writeback.c +@@ -196,3 +197,8 @@ int sb_init_dio_done_wq(struct super_block *sb); + */ + int do_statx(int dfd, const char __user *filename, unsigned flags, + unsigned int mask, struct statx __user *buffer); ++ ++/* ++ * fs/attr.c ++ */ ++int setattr_should_drop_sgid(const struct inode *inode); +diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c +index bd7d58d27bfc6..97a3c09fd96b6 100644 +--- a/fs/jffs2/file.c ++++ b/fs/jffs2/file.c +@@ -138,19 +138,18 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, + struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); + struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); + pgoff_t index = pos >> PAGE_SHIFT; +- uint32_t pageofs = index << PAGE_SHIFT; + int ret = 0; + + jffs2_dbg(1, "%s()\n", __func__); + +- if (pageofs > inode->i_size) { +- /* Make new hole frag from old EOF to new page */ ++ if (pos > inode->i_size) { ++ /* Make new hole frag from old EOF to new position */ + struct jffs2_raw_inode ri; + struct jffs2_full_dnode *fn; + uint32_t alloc_len; + +- jffs2_dbg(1, "Writing new hole frag 0x%x-0x%x between current EOF and new page\n", +- (unsigned int)inode->i_size, pageofs); ++ jffs2_dbg(1, "Writing new hole frag 0x%x-0x%x between current EOF and new position\n", ++ (unsigned int)inode->i_size, (uint32_t)pos); + + ret = jffs2_reserve_space(c, sizeof(ri), &alloc_len, + ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE); +@@ -170,10 +169,10 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, + ri.mode = cpu_to_jemode(inode->i_mode); + ri.uid = cpu_to_je16(i_uid_read(inode)); + ri.gid = cpu_to_je16(i_gid_read(inode)); +- ri.isize = cpu_to_je32(max((uint32_t)inode->i_size, pageofs)); ++ ri.isize = cpu_to_je32((uint32_t)pos); + ri.atime = ri.ctime = ri.mtime = cpu_to_je32(JFFS2_NOW()); + ri.offset = cpu_to_je32(inode->i_size); +- ri.dsize = cpu_to_je32(pageofs - inode->i_size); ++ ri.dsize = cpu_to_je32((uint32_t)pos - 
inode->i_size); + ri.csize = cpu_to_je32(0); + ri.compr = JFFS2_COMPR_ZERO; + ri.node_crc = cpu_to_je32(crc32(0, &ri, sizeof(ri)-8)); +@@ -203,7 +202,7 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, + goto out_err; + } + jffs2_complete_reservation(c); +- inode->i_size = pageofs; ++ inode->i_size = pos; + mutex_unlock(&f->sem); + } + +diff --git a/fs/namei.c b/fs/namei.c +index 4159c140fa473..3d98db9802a77 100644 +--- a/fs/namei.c ++++ b/fs/namei.c +@@ -2798,6 +2798,63 @@ void unlock_rename(struct dentry *p1, struct dentry *p2) + } + EXPORT_SYMBOL(unlock_rename); + ++/** ++ * mode_strip_umask - handle vfs umask stripping ++ * @dir: parent directory of the new inode ++ * @mode: mode of the new inode to be created in @dir ++ * ++ * Umask stripping depends on whether or not the filesystem supports POSIX ++ * ACLs. If the filesystem doesn't support it umask stripping is done directly ++ * in here. If the filesystem does support POSIX ACLs umask stripping is ++ * deferred until the filesystem calls posix_acl_create(). ++ * ++ * Returns: mode ++ */ ++static inline umode_t mode_strip_umask(const struct inode *dir, umode_t mode) ++{ ++ if (!IS_POSIXACL(dir)) ++ mode &= ~current_umask(); ++ return mode; ++} ++ ++/** ++ * vfs_prepare_mode - prepare the mode to be used for a new inode ++ * @dir: parent directory of the new inode ++ * @mode: mode of the new inode ++ * @mask_perms: allowed permission by the vfs ++ * @type: type of file to be created ++ * ++ * This helper consolidates and enforces vfs restrictions on the @mode of a new ++ * object to be created. ++ * ++ * Umask stripping depends on whether the filesystem supports POSIX ACLs (see ++ * the kernel documentation for mode_strip_umask()). Moving umask stripping ++ * after setgid stripping allows the same ordering for both non-POSIX ACL and ++ * POSIX ACL supporting filesystems. ++ * ++ * Note that it's currently valid for @type to be 0 if a directory is created. ++ * Filesystems raise that flag individually and we need to check whether each ++ * filesystem can deal with receiving S_IFDIR from the vfs before we enforce a ++ * non-zero type. ++ * ++ * Returns: mode to be passed to the filesystem ++ */ ++static inline umode_t vfs_prepare_mode(const struct inode *dir, umode_t mode, ++ umode_t mask_perms, umode_t type) ++{ ++ mode = mode_strip_sgid(dir, mode); ++ mode = mode_strip_umask(dir, mode); ++ ++ /* ++ * Apply the vfs mandated allowed permission mask and set the type of ++ * file to be created before we call into the filesystem. ++ */ ++ mode &= (mask_perms & ~S_IFMT); ++ mode |= (type & S_IFMT); ++ ++ return mode; ++} ++ + int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, + bool want_excl) + { +@@ -2807,8 +2864,8 @@ int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, + + if (!dir->i_op->create) + return -EACCES; /* shouldn't it be ENOSYS? 
*/ +- mode &= S_IALLUGO; +- mode |= S_IFREG; ++ ++ mode = vfs_prepare_mode(dir, mode, S_IALLUGO, S_IFREG); + error = security_inode_create(dir, dentry, mode); + if (error) + return error; +@@ -3072,8 +3129,7 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file, + if (open_flag & O_CREAT) { + if (open_flag & O_EXCL) + open_flag &= ~O_TRUNC; +- if (!IS_POSIXACL(dir->d_inode)) +- mode &= ~current_umask(); ++ mode = vfs_prepare_mode(dir->d_inode, mode, mode, mode); + if (likely(got_write)) + create_error = may_o_create(&nd->path, dentry, mode); + else +@@ -3286,8 +3342,7 @@ struct dentry *vfs_tmpfile(struct dentry *dentry, umode_t mode, int open_flag) + child = d_alloc(dentry, &slash_name); + if (unlikely(!child)) + goto out_err; +- if (!IS_POSIXACL(dir)) +- mode &= ~current_umask(); ++ mode = vfs_prepare_mode(dir, mode, mode, mode); + error = dir->i_op->tmpfile(dir, child, mode); + if (error) + goto out_err; +@@ -3548,6 +3603,7 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) + if (!dir->i_op->mknod) + return -EPERM; + ++ mode = vfs_prepare_mode(dir, mode, mode, mode); + error = devcgroup_inode_mknod(mode, dev); + if (error) + return error; +@@ -3596,9 +3652,8 @@ retry: + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + +- if (!IS_POSIXACL(path.dentry->d_inode)) +- mode &= ~current_umask(); +- error = security_path_mknod(&path, dentry, mode, dev); ++ error = security_path_mknod(&path, dentry, ++ mode_strip_umask(path.dentry->d_inode, mode), dev); + if (error) + goto out; + switch (mode & S_IFMT) { +@@ -3646,7 +3701,7 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) + if (!dir->i_op->mkdir) + return -EPERM; + +- mode &= (S_IRWXUGO|S_ISVTX); ++ mode = vfs_prepare_mode(dir, mode, S_IRWXUGO | S_ISVTX, 0); + error = security_inode_mkdir(dir, dentry, mode); + if (error) + return error; +@@ -3673,9 +3728,8 @@ retry: + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + +- if (!IS_POSIXACL(path.dentry->d_inode)) +- mode &= ~current_umask(); +- error = security_path_mkdir(&path, dentry, mode); ++ error = security_path_mkdir(&path, dentry, ++ mode_strip_umask(path.dentry->d_inode, mode)); + if (!error) + error = vfs_mkdir(path.dentry->d_inode, dentry, mode); + done_path_create(&path, dentry); +diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c +index 1470b49adb2db..ca00cac5a12f7 100644 +--- a/fs/ocfs2/file.c ++++ b/fs/ocfs2/file.c +@@ -1994,7 +1994,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, + } + } + +- if (file && should_remove_suid(file->f_path.dentry)) { ++ if (file && setattr_should_drop_suidgid(file_inode(file))) { + ret = __ocfs2_write_remove_suid(inode, di_bh); + if (ret) { + mlog_errno(ret); +@@ -2282,7 +2282,7 @@ static int ocfs2_prepare_inode_for_write(struct file *file, + * inode. There's also the dinode i_size state which + * can be lost via setattr during extending writes (we + * set inode->i_size at the end of a write. */ +- if (should_remove_suid(dentry)) { ++ if (setattr_should_drop_suidgid(inode)) { + if (meta_level == 0) { + ocfs2_inode_unlock_for_extent_tree(inode, + &di_bh, +diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c +index 856474b0a1ae7..df1f6b7aa7979 100644 +--- a/fs/ocfs2/namei.c ++++ b/fs/ocfs2/namei.c +@@ -198,6 +198,7 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, umode_t mode) + * callers. 
*/ + if (S_ISDIR(mode)) + set_nlink(inode, 2); ++ mode = mode_strip_sgid(dir, mode); + inode_init_owner(inode, dir, mode); + status = dquot_initialize(inode); + if (status) +diff --git a/fs/open.c b/fs/open.c +index b3fbb4300fc96..1ca4b236fdbe0 100644 +--- a/fs/open.c ++++ b/fs/open.c +@@ -665,10 +665,10 @@ retry_deleg: + newattrs.ia_valid |= ATTR_GID; + newattrs.ia_gid = gid; + } +- if (!S_ISDIR(inode->i_mode)) +- newattrs.ia_valid |= +- ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV; + inode_lock(inode); ++ if (!S_ISDIR(inode->i_mode)) ++ newattrs.ia_valid |= ATTR_KILL_SUID | ATTR_KILL_PRIV | ++ setattr_should_drop_sgid(inode); + error = security_path_chown(path, uid, gid); + if (!error) + error = notify_change(path->dentry, &newattrs, &delegated_inode); +diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c +index 24c7d30e41dfe..0926363179a76 100644 +--- a/fs/xfs/libxfs/xfs_btree.c ++++ b/fs/xfs/libxfs/xfs_btree.c +@@ -3190,7 +3190,7 @@ xfs_btree_insrec( + struct xfs_btree_block *block; /* btree block */ + struct xfs_buf *bp; /* buffer for block */ + union xfs_btree_ptr nptr; /* new block ptr */ +- struct xfs_btree_cur *ncur; /* new btree cursor */ ++ struct xfs_btree_cur *ncur = NULL; /* new btree cursor */ + union xfs_btree_key nkey; /* new block key */ + union xfs_btree_key *lkey; + int optr; /* old key/record index */ +@@ -3270,7 +3270,7 @@ xfs_btree_insrec( + #ifdef DEBUG + error = xfs_btree_check_block(cur, block, level, bp); + if (error) +- return error; ++ goto error0; + #endif + + /* +@@ -3290,7 +3290,7 @@ xfs_btree_insrec( + for (i = numrecs - ptr; i >= 0; i--) { + error = xfs_btree_debug_check_ptr(cur, pp, i, level); + if (error) +- return error; ++ goto error0; + } + + xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1); +@@ -3375,6 +3375,8 @@ xfs_btree_insrec( + return 0; + + error0: ++ if (ncur) ++ xfs_btree_del_cursor(ncur, error); + return error; + } + +diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c +index 7371a7f7c6529..fbab1042bc90b 100644 +--- a/fs/xfs/xfs_bmap_util.c ++++ b/fs/xfs/xfs_bmap_util.c +@@ -800,9 +800,6 @@ xfs_alloc_file_space( + quota_flag = XFS_QMOPT_RES_REGBLKS; + } + +- /* +- * Allocate and setup the transaction. 
+- */ + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, + resrtextents, 0, &tp); + +@@ -830,9 +827,9 @@ xfs_alloc_file_space( + if (error) + goto error0; + +- /* +- * Complete the transaction +- */ ++ ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC; ++ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); ++ + error = xfs_trans_commit(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + if (error) +diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c +index 4d6bf8d4974fe..9b6c5ba5fdfb6 100644 +--- a/fs/xfs/xfs_file.c ++++ b/fs/xfs/xfs_file.c +@@ -94,8 +94,6 @@ xfs_update_prealloc_flags( + ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC; + + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); +- if (flags & XFS_PREALLOC_SYNC) +- xfs_trans_set_sync(tp); + return xfs_trans_commit(tp); + } + +@@ -852,7 +850,6 @@ xfs_file_fallocate( + struct inode *inode = file_inode(file); + struct xfs_inode *ip = XFS_I(inode); + long error; +- enum xfs_prealloc_flags flags = 0; + uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; + loff_t new_size = 0; + bool do_file_insert = false; +@@ -897,6 +894,10 @@ xfs_file_fallocate( + goto out_unlock; + } + ++ error = file_modified(file); ++ if (error) ++ goto out_unlock; ++ + if (mode & FALLOC_FL_PUNCH_HOLE) { + error = xfs_free_file_space(ip, offset, len); + if (error) +@@ -946,8 +947,6 @@ xfs_file_fallocate( + } + do_file_insert = true; + } else { +- flags |= XFS_PREALLOC_SET; +- + if (!(mode & FALLOC_FL_KEEP_SIZE) && + offset + len > i_size_read(inode)) { + new_size = offset + len; +@@ -1000,13 +999,6 @@ xfs_file_fallocate( + } + } + +- if (file->f_flags & O_DSYNC) +- flags |= XFS_PREALLOC_SYNC; +- +- error = xfs_update_prealloc_flags(ip, flags); +- if (error) +- goto out_unlock; +- + /* Change file size if needed */ + if (new_size) { + struct iattr iattr; +@@ -1024,8 +1016,14 @@ xfs_file_fallocate( + * leave shifted extents past EOF and hence losing access to + * the data that is contained within them. 
+ */ +- if (do_file_insert) ++ if (do_file_insert) { + error = xfs_insert_file_space(ip, offset, len); ++ if (error) ++ goto out_unlock; ++ } ++ ++ if (file->f_flags & O_DSYNC) ++ error = xfs_log_force_inode(ip); + + out_unlock: + xfs_iunlock(ip, iolock); +diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c +index 6a3026e78a9bb..69fef29df4284 100644 +--- a/fs/xfs/xfs_iops.c ++++ b/fs/xfs/xfs_iops.c +@@ -595,37 +595,6 @@ xfs_vn_getattr( + return 0; + } + +-static void +-xfs_setattr_mode( +- struct xfs_inode *ip, +- struct iattr *iattr) +-{ +- struct inode *inode = VFS_I(ip); +- umode_t mode = iattr->ia_mode; +- +- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); +- +- inode->i_mode &= S_IFMT; +- inode->i_mode |= mode & ~S_IFMT; +-} +- +-void +-xfs_setattr_time( +- struct xfs_inode *ip, +- struct iattr *iattr) +-{ +- struct inode *inode = VFS_I(ip); +- +- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); +- +- if (iattr->ia_valid & ATTR_ATIME) +- inode->i_atime = iattr->ia_atime; +- if (iattr->ia_valid & ATTR_CTIME) +- inode->i_ctime = iattr->ia_ctime; +- if (iattr->ia_valid & ATTR_MTIME) +- inode->i_mtime = iattr->ia_mtime; +-} +- + static int + xfs_vn_change_ok( + struct dentry *dentry, +@@ -740,16 +709,6 @@ xfs_setattr_nonsize( + goto out_cancel; + } + +- /* +- * CAP_FSETID overrides the following restrictions: +- * +- * The set-user-ID and set-group-ID bits of a file will be +- * cleared upon successful return from chown() +- */ +- if ((inode->i_mode & (S_ISUID|S_ISGID)) && +- !capable(CAP_FSETID)) +- inode->i_mode &= ~(S_ISUID|S_ISGID); +- + /* + * Change the ownerships and register quota modifications + * in the transaction. +@@ -761,7 +720,6 @@ xfs_setattr_nonsize( + olddquot1 = xfs_qm_vop_chown(tp, ip, + &ip->i_udquot, udqp); + } +- inode->i_uid = uid; + } + if (!gid_eq(igid, gid)) { + if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_GQUOTA_ON(mp)) { +@@ -772,15 +730,10 @@ xfs_setattr_nonsize( + olddquot2 = xfs_qm_vop_chown(tp, ip, + &ip->i_gdquot, gdqp); + } +- inode->i_gid = gid; + } + } + +- if (mask & ATTR_MODE) +- xfs_setattr_mode(ip, iattr); +- if (mask & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME)) +- xfs_setattr_time(ip, iattr); +- ++ setattr_copy(inode, iattr); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + XFS_STATS_INC(mp, xs_ig_attrchg); +@@ -1025,11 +978,8 @@ xfs_setattr_size( + xfs_inode_clear_eofblocks_tag(ip); + } + +- if (iattr->ia_valid & ATTR_MODE) +- xfs_setattr_mode(ip, iattr); +- if (iattr->ia_valid & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME)) +- xfs_setattr_time(ip, iattr); +- ++ ASSERT(!(iattr->ia_valid & (ATTR_UID | ATTR_GID))); ++ setattr_copy(inode, iattr); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + XFS_STATS_INC(mp, xs_ig_attrchg); +diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h +index 4d24ff309f593..dd1bd0332f8e3 100644 +--- a/fs/xfs/xfs_iops.h ++++ b/fs/xfs/xfs_iops.h +@@ -18,7 +18,6 @@ extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size); + */ + #define XFS_ATTR_NOACL 0x01 /* Don't call posix_acl_chmod */ + +-extern void xfs_setattr_time(struct xfs_inode *ip, struct iattr *iattr); + extern int xfs_setattr_nonsize(struct xfs_inode *ip, struct iattr *vap, + int flags); + extern int xfs_vn_setattr_nonsize(struct dentry *dentry, struct iattr *vap); +diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c +index a2a5a0fd92334..402cf828cc919 100644 +--- a/fs/xfs/xfs_mount.c ++++ b/fs/xfs/xfs_mount.c +@@ -126,7 +126,6 @@ __xfs_free_perag( + { + struct xfs_perag *pag = container_of(head, struct xfs_perag, rcu_head); + +- ASSERT(atomic_read(&pag->pag_ref) == 0); + 
kmem_free(pag); + } + +@@ -145,7 +144,7 @@ xfs_free_perag( + pag = radix_tree_delete(&mp->m_perag_tree, agno); + spin_unlock(&mp->m_perag_lock); + ASSERT(pag); +- ASSERT(atomic_read(&pag->pag_ref) == 0); ++ XFS_IS_CORRUPT(pag->pag_mount, atomic_read(&pag->pag_ref) != 0); + xfs_iunlink_destroy(pag); + xfs_buf_hash_destroy(pag); + call_rcu(&pag->rcu_head, __xfs_free_perag); +diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c +index f3082a957d5e1..053b99929f835 100644 +--- a/fs/xfs/xfs_pnfs.c ++++ b/fs/xfs/xfs_pnfs.c +@@ -164,10 +164,12 @@ xfs_fs_map_blocks( + * that the blocks allocated and handed out to the client are + * guaranteed to be present even after a server crash. + */ +- error = xfs_update_prealloc_flags(ip, +- XFS_PREALLOC_SET | XFS_PREALLOC_SYNC); ++ error = xfs_update_prealloc_flags(ip, XFS_PREALLOC_SET); ++ if (!error) ++ error = xfs_log_force_inode(ip); + if (error) + goto out_unlock; ++ + } else { + xfs_iunlock(ip, lock_flags); + } +@@ -283,7 +285,8 @@ xfs_fs_commit_blocks( + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + +- xfs_setattr_time(ip, iattr); ++ ASSERT(!(iattr->ia_valid & (ATTR_UID | ATTR_GID))); ++ setattr_copy(inode, iattr); + if (update_isize) { + i_size_write(inode, iattr->ia_size); + ip->i_d.di_size = iattr->ia_size; +diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c +index 64e5da33733b9..3c17e0c0f8169 100644 +--- a/fs/xfs/xfs_qm.c ++++ b/fs/xfs/xfs_qm.c +@@ -1318,8 +1318,15 @@ xfs_qm_quotacheck( + + error = xfs_iwalk_threaded(mp, 0, 0, xfs_qm_dqusage_adjust, 0, true, + NULL); +- if (error) ++ if (error) { ++ /* ++ * The inode walk may have partially populated the dquot ++ * caches. We must purge them before disabling quota and ++ * tearing down the quotainfo, or else the dquots will leak. ++ */ ++ xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL); + goto error_return; ++ } + + /* + * We've made all the changes that we need to make incore. Flush them +diff --git a/include/drm/drm_bridge.h b/include/drm/drm_bridge.h +index 2195daa289d27..055486e35e68f 100644 +--- a/include/drm/drm_bridge.h ++++ b/include/drm/drm_bridge.h +@@ -427,11 +427,11 @@ struct drm_bridge_funcs { + * + * The returned array must be allocated with kmalloc() and will be + * freed by the caller. If the allocation fails, NULL should be +- * returned. num_output_fmts must be set to the returned array size. ++ * returned. num_input_fmts must be set to the returned array size. + * Formats listed in the returned array should be listed in decreasing + * preference order (the core will try all formats until it finds one + * that works). When the format is not supported NULL should be +- * returned and num_output_fmts should be set to 0. ++ * returned and num_input_fmts should be set to 0. 
+ * + * This method is called on all elements of the bridge chain as part of + * the bus format negotiation process that happens in +diff --git a/include/linux/fs.h b/include/linux/fs.h +index 74e19bccbf738..8ce9e5c61ede8 100644 +--- a/include/linux/fs.h ++++ b/include/linux/fs.h +@@ -1768,6 +1768,7 @@ extern long compat_ptr_ioctl(struct file *file, unsigned int cmd, + extern void inode_init_owner(struct inode *inode, const struct inode *dir, + umode_t mode); + extern bool may_open_dev(const struct path *path); ++umode_t mode_strip_sgid(const struct inode *dir, umode_t mode); + + /* + * This is the "filldir" function type, used by readdir() to let +@@ -2959,7 +2960,7 @@ extern void __destroy_inode(struct inode *); + extern struct inode *new_inode_pseudo(struct super_block *sb); + extern struct inode *new_inode(struct super_block *sb); + extern void free_inode_nonrcu(struct inode *inode); +-extern int should_remove_suid(struct dentry *); ++extern int setattr_should_drop_suidgid(struct inode *); + extern int file_remove_privs(struct file *); + + extern void __insert_inode_hash(struct inode *, unsigned long hashval); +@@ -3407,7 +3408,7 @@ int __init get_filesystem_list(char *buf); + + static inline bool is_sxid(umode_t mode) + { +- return (mode & S_ISUID) || ((mode & S_ISGID) && (mode & S_IXGRP)); ++ return mode & (S_ISUID | S_ISGID); + } + + static inline int check_sticky(struct inode *dir, struct inode *inode) +diff --git a/include/linux/hid.h b/include/linux/hid.h +index 2ba33d708942c..256f34f49167c 100644 +--- a/include/linux/hid.h ++++ b/include/linux/hid.h +@@ -798,6 +798,7 @@ struct hid_driver { + * @raw_request: send raw report request to device (e.g. feature report) + * @output_report: send output report to device + * @idle: send idle request to device ++ * @max_buffer_size: over-ride maximum data buffer size (default: HID_MAX_BUFFER_SIZE) + */ + struct hid_ll_driver { + int (*start)(struct hid_device *hdev); +@@ -822,6 +823,8 @@ struct hid_ll_driver { + int (*output_report) (struct hid_device *hdev, __u8 *buf, size_t len); + + int (*idle)(struct hid_device *hdev, int report, int idle, int reqtype); ++ ++ unsigned int max_buffer_size; + }; + + extern struct hid_ll_driver i2c_hid_ll_driver; +diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h +index b478a16ef284d..9ef63bc14b002 100644 +--- a/include/linux/netdevice.h ++++ b/include/linux/netdevice.h +@@ -270,9 +270,11 @@ struct hh_cache { + * relationship HH alignment <= LL alignment. + */ + #define LL_RESERVED_SPACE(dev) \ +- ((((dev)->hard_header_len+(dev)->needed_headroom)&~(HH_DATA_MOD - 1)) + HH_DATA_MOD) ++ ((((dev)->hard_header_len + READ_ONCE((dev)->needed_headroom)) \ ++ & ~(HH_DATA_MOD - 1)) + HH_DATA_MOD) + #define LL_RESERVED_SPACE_EXTRA(dev,extra) \ +- ((((dev)->hard_header_len+(dev)->needed_headroom+(extra))&~(HH_DATA_MOD - 1)) + HH_DATA_MOD) ++ ((((dev)->hard_header_len + READ_ONCE((dev)->needed_headroom) + (extra)) \ ++ & ~(HH_DATA_MOD - 1)) + HH_DATA_MOD) + + struct header_ops { + int (*create) (struct sk_buff *skb, struct net_device *dev, +diff --git a/include/linux/sh_intc.h b/include/linux/sh_intc.h +index c255273b02810..37ad81058d6ae 100644 +--- a/include/linux/sh_intc.h ++++ b/include/linux/sh_intc.h +@@ -97,7 +97,10 @@ struct intc_hw_desc { + unsigned int nr_subgroups; + }; + +-#define _INTC_ARRAY(a) a, __same_type(a, NULL) ? 
0 : sizeof(a)/sizeof(*a) ++#define _INTC_SIZEOF_OR_ZERO(a) (_Generic(a, \ ++ typeof(NULL): 0, \ ++ default: sizeof(a))) ++#define _INTC_ARRAY(a) a, _INTC_SIZEOF_OR_ZERO(a)/sizeof(*a) + + #define INTC_HW_DESC(vectors, groups, mask_regs, \ + prio_regs, sense_regs, ack_regs) \ +diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h +index e4c5df71f0e74..4e1356c35fe62 100644 +--- a/include/linux/tracepoint.h ++++ b/include/linux/tracepoint.h +@@ -234,12 +234,11 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) + * not add unwanted padding between the beginning of the section and the + * structure. Force alignment to the same alignment as the section start. + * +- * When lockdep is enabled, we make sure to always do the RCU portions of +- * the tracepoint code, regardless of whether tracing is on. However, +- * don't check if the condition is false, due to interaction with idle +- * instrumentation. This lets us find RCU issues triggered with tracepoints +- * even when this tracepoint is off. This code has no purpose other than +- * poking RCU a bit. ++ * When lockdep is enabled, we make sure to always test if RCU is ++ * "watching" regardless if the tracepoint is enabled or not. Tracepoints ++ * require RCU to be active, and it should always warn at the tracepoint ++ * site if it is not watching, as it will need to be active when the ++ * tracepoint is enabled. + */ + #define __DECLARE_TRACE(name, proto, args, cond, data_proto, data_args) \ + extern int __traceiter_##name(data_proto); \ +@@ -253,9 +252,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) + TP_ARGS(data_args), \ + TP_CONDITION(cond), 0); \ + if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) { \ +- rcu_read_lock_sched_notrace(); \ +- rcu_dereference_sched(__tracepoint_##name.funcs);\ +- rcu_read_unlock_sched_notrace(); \ ++ WARN_ON_ONCE(!rcu_is_watching()); \ + } \ + } \ + __DECLARE_TRACE_RCU(name, PARAMS(proto), PARAMS(args), \ +diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c +index 445afda927f47..fd799567fc23a 100644 +--- a/io_uring/io_uring.c ++++ b/io_uring/io_uring.c +@@ -5792,10 +5792,10 @@ static int io_arm_poll_handler(struct io_kiocb *req) + } + } else { + apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC); ++ if (unlikely(!apoll)) ++ return IO_APOLL_ABORTED; + apoll->poll.retries = APOLL_MAX_RETRY; + } +- if (unlikely(!apoll)) +- return IO_APOLL_ABORTED; + apoll->double_poll = NULL; + req->apoll = apoll; + req->flags |= REQ_F_POLLED; +diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c +index d97c189695cbb..67829b6e07bdc 100644 +--- a/kernel/trace/ftrace.c ++++ b/kernel/trace/ftrace.c +@@ -1538,7 +1538,8 @@ static struct dyn_ftrace *lookup_rec(unsigned long start, unsigned long end) + key.flags = end; /* overload flags, as it is unsigned long */ + + for (pg = ftrace_pages_start; pg; pg = pg->next) { +- if (end < pg->records[0].ip || ++ if (pg->index == 0 || ++ end < pg->records[0].ip || + start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE)) + continue; + rec = bsearch(&key, pg->records, pg->index, +diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c +index 8637eab2986ee..ce45bdd9077db 100644 +--- a/kernel/trace/trace.c ++++ b/kernel/trace/trace.c +@@ -4705,6 +4705,8 @@ loff_t tracing_lseek(struct file *file, loff_t offset, int whence) + static const struct file_operations tracing_fops = { + .open = tracing_open, + .read = seq_read, ++ .read_iter = seq_read_iter, ++ .splice_read = generic_file_splice_read, + .write = tracing_write_stub, + 
.llseek = tracing_lseek, + .release = tracing_release, +diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c +index ccc99cd23f3c4..9ed65191888ef 100644 +--- a/kernel/trace/trace_events_hist.c ++++ b/kernel/trace/trace_events_hist.c +@@ -1087,6 +1087,9 @@ static const char *hist_field_name(struct hist_field *field, + { + const char *field_name = ""; + ++ if (WARN_ON_ONCE(!field)) ++ return field_name; ++ + if (level > 1) + return field_name; + +diff --git a/mm/huge_memory.c b/mm/huge_memory.c +index 9b15760e0541a..e4c690c21fc9c 100644 +--- a/mm/huge_memory.c ++++ b/mm/huge_memory.c +@@ -1994,7 +1994,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, + { + struct mm_struct *mm = vma->vm_mm; + pgtable_t pgtable; +- pmd_t _pmd; ++ pmd_t _pmd, old_pmd; + int i; + + /* +@@ -2005,7 +2005,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, + * + * See Documentation/vm/mmu_notifier.rst + */ +- pmdp_huge_clear_flush(vma, haddr, pmd); ++ old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd); + + pgtable = pgtable_trans_huge_withdraw(mm, pmd); + pmd_populate(mm, &_pmd, pgtable); +@@ -2014,6 +2014,8 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, + pte_t *pte, entry; + entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot); + entry = pte_mkspecial(entry); ++ if (pmd_uffd_wp(old_pmd)) ++ entry = pte_mkuffd_wp(entry); + pte = pte_offset_map(&_pmd, haddr); + VM_BUG_ON(!pte_none(*pte)); + set_pte_at(mm, haddr, pte, entry); +diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c +index 5f786ef662ead..41f890bf9d4c4 100644 +--- a/net/ipv4/fib_frontend.c ++++ b/net/ipv4/fib_frontend.c +@@ -573,6 +573,9 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt, + cfg->fc_scope = RT_SCOPE_UNIVERSE; + } + ++ if (!cfg->fc_table) ++ cfg->fc_table = RT_TABLE_MAIN; ++ + if (cmd == SIOCDELRT) + return 0; + +diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c +index be75b409445c2..99f70b990eb13 100644 +--- a/net/ipv4/ip_tunnel.c ++++ b/net/ipv4/ip_tunnel.c +@@ -613,10 +613,10 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, + } + + headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len; +- if (headroom > dev->needed_headroom) +- dev->needed_headroom = headroom; ++ if (headroom > READ_ONCE(dev->needed_headroom)) ++ WRITE_ONCE(dev->needed_headroom, headroom); + +- if (skb_cow_head(skb, dev->needed_headroom)) { ++ if (skb_cow_head(skb, READ_ONCE(dev->needed_headroom))) { + ip_rt_put(rt); + goto tx_dropped; + } +@@ -797,10 +797,10 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, + + max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr) + + rt->dst.header_len + ip_encap_hlen(&tunnel->encap); +- if (max_headroom > dev->needed_headroom) +- dev->needed_headroom = max_headroom; ++ if (max_headroom > READ_ONCE(dev->needed_headroom)) ++ WRITE_ONCE(dev->needed_headroom, max_headroom); + +- if (skb_cow_head(skb, dev->needed_headroom)) { ++ if (skb_cow_head(skb, READ_ONCE(dev->needed_headroom))) { + ip_rt_put(rt); + dev->stats.tx_dropped++; + kfree_skb(skb); +diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c +index eefd032bc6dbd..e4ad274ec7a30 100644 +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -3609,7 +3609,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, + th->window = htons(min(req->rsk_rcv_wnd, 65535U)); + tcp_options_write((__be32 *)(th + 1), NULL, &opts); + th->doff = (tcp_header_size 
>> 2); +- __TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS); ++ TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS); + + #ifdef CONFIG_TCP_MD5SIG + /* Okay, we have all we need - do the md5 hash if needed */ +diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c +index 0d4cab94c5dd2..a03a322e0cc1c 100644 +--- a/net/ipv6/ip6_tunnel.c ++++ b/net/ipv6/ip6_tunnel.c +@@ -1267,8 +1267,8 @@ route_lookup: + */ + max_headroom = LL_RESERVED_SPACE(dst->dev) + sizeof(struct ipv6hdr) + + dst->header_len + t->hlen; +- if (max_headroom > dev->needed_headroom) +- dev->needed_headroom = max_headroom; ++ if (max_headroom > READ_ONCE(dev->needed_headroom)) ++ WRITE_ONCE(dev->needed_headroom, max_headroom); + + err = ip6_tnl_encap(skb, t, &proto, fl6); + if (err) +diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c +index 349c6ac3313f7..6f84978a77265 100644 +--- a/net/iucv/iucv.c ++++ b/net/iucv/iucv.c +@@ -83,7 +83,7 @@ struct iucv_irq_data { + u16 ippathid; + u8 ipflags1; + u8 iptype; +- u32 res2[8]; ++ u32 res2[9]; + }; + + struct iucv_irq_list { +diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c +index 3b154ad4945c4..607519246bf28 100644 +--- a/net/mptcp/subflow.c ++++ b/net/mptcp/subflow.c +@@ -275,7 +275,6 @@ void mptcp_subflow_reset(struct sock *ssk) + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + struct sock *sk = subflow->conn; + +- tcp_set_state(ssk, TCP_CLOSE); + tcp_send_active_reset(ssk, GFP_ATOMIC); + tcp_done(ssk); + if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &mptcp_sk(sk)->flags) && +diff --git a/net/netfilter/nft_masq.c b/net/netfilter/nft_masq.c +index 9953e80537536..1818dbf089cad 100644 +--- a/net/netfilter/nft_masq.c ++++ b/net/netfilter/nft_masq.c +@@ -43,7 +43,7 @@ static int nft_masq_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) + { +- u32 plen = sizeof_field(struct nf_nat_range, min_addr.all); ++ u32 plen = sizeof_field(struct nf_nat_range, min_proto.all); + struct nft_masq *priv = nft_expr_priv(expr); + int err; + +diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c +index db8f9116eeb43..cd4eb4996aff3 100644 +--- a/net/netfilter/nft_nat.c ++++ b/net/netfilter/nft_nat.c +@@ -226,7 +226,7 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, + priv->flags |= NF_NAT_RANGE_MAP_IPS; + } + +- plen = sizeof_field(struct nf_nat_range, min_addr.all); ++ plen = sizeof_field(struct nf_nat_range, min_proto.all); + if (tb[NFTA_NAT_REG_PROTO_MIN]) { + err = nft_parse_register_load(tb[NFTA_NAT_REG_PROTO_MIN], + &priv->sreg_proto_min, plen); +diff --git a/net/netfilter/nft_redir.c b/net/netfilter/nft_redir.c +index ba09890dddb50..e64f531d66cfc 100644 +--- a/net/netfilter/nft_redir.c ++++ b/net/netfilter/nft_redir.c +@@ -48,7 +48,7 @@ static int nft_redir_init(const struct nft_ctx *ctx, + unsigned int plen; + int err; + +- plen = sizeof_field(struct nf_nat_range, min_addr.all); ++ plen = sizeof_field(struct nf_nat_range, min_proto.all); + if (tb[NFTA_REDIR_REG_PROTO_MIN]) { + err = nft_parse_register_load(tb[NFTA_REDIR_REG_PROTO_MIN], + &priv->sreg_proto_min, plen); +@@ -232,7 +232,7 @@ static struct nft_expr_type nft_redir_inet_type __read_mostly = { + .name = "redir", + .ops = &nft_redir_inet_ops, + .policy = nft_redir_policy, +- .maxattr = NFTA_MASQ_MAX, ++ .maxattr = NFTA_REDIR_MAX, + .owner = THIS_MODULE, + }; + +diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c +index 94503f36b9a61..9125d28d9ff5d 100644 +--- a/net/smc/smc_cdc.c ++++ b/net/smc/smc_cdc.c +@@ -104,6 +104,9 @@ int 
smc_cdc_msg_send(struct smc_connection *conn, + union smc_host_cursor cfed; + int rc; + ++ if (unlikely(!READ_ONCE(conn->sndbuf_desc))) ++ return -ENOBUFS; ++ + smc_cdc_add_pending_send(conn, pend); + + conn->tx_cdc_seq++; +diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c +index bf485a2017a4e..e84241ff4ac4f 100644 +--- a/net/smc/smc_core.c ++++ b/net/smc/smc_core.c +@@ -912,7 +912,7 @@ static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft) + if (lgr->terminating) + return; /* lgr already terminating */ + /* cancel free_work sync, will terminate when lgr->freeing is set */ +- cancel_delayed_work_sync(&lgr->free_work); ++ cancel_delayed_work(&lgr->free_work); + lgr->terminating = 1; + + /* kill remaining link group connections */ +diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c +index fdbd56ed4bd52..ba73014805a4f 100644 +--- a/net/xfrm/xfrm_state.c ++++ b/net/xfrm/xfrm_state.c +@@ -2611,9 +2611,6 @@ int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload) + if (inner_mode == NULL) + goto error; + +- if (!(inner_mode->flags & XFRM_MODE_FLAG_TUNNEL)) +- goto error; +- + x->inner_mode = *inner_mode; + + if (x->props.family == AF_INET) +diff --git a/sound/hda/intel-dsp-config.c b/sound/hda/intel-dsp-config.c +index 2a5ba9dca6b08..f96e70c85f84a 100644 +--- a/sound/hda/intel-dsp-config.c ++++ b/sound/hda/intel-dsp-config.c +@@ -359,6 +359,15 @@ static const struct config_entry config_table[] = { + }, + #endif + ++/* Meteor Lake */ ++#if IS_ENABLED(CONFIG_SND_SOC_SOF_METEORLAKE) ++ /* Meteorlake-P */ ++ { ++ .flags = FLAG_SOF | FLAG_SOF_ONLY_IF_DMIC_OR_SOUNDWIRE, ++ .device = 0x7e28, ++ }, ++#endif ++ + }; + + static const struct config_entry *snd_intel_dsp_find_config +diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c +index 494bfd2135a9e..de1fe604905f3 100644 +--- a/sound/pci/hda/hda_intel.c ++++ b/sound/pci/hda/hda_intel.c +@@ -365,14 +365,15 @@ enum { + #define needs_eld_notify_link(chip) false + #endif + +-#define CONTROLLER_IN_GPU(pci) (((pci)->device == 0x0a0c) || \ ++#define CONTROLLER_IN_GPU(pci) (((pci)->vendor == 0x8086) && \ ++ (((pci)->device == 0x0a0c) || \ + ((pci)->device == 0x0c0c) || \ + ((pci)->device == 0x0d0c) || \ + ((pci)->device == 0x160c) || \ + ((pci)->device == 0x490d) || \ + ((pci)->device == 0x4f90) || \ + ((pci)->device == 0x4f91) || \ +- ((pci)->device == 0x4f92)) ++ ((pci)->device == 0x4f92))) + + #define IS_BXT(pci) ((pci)->vendor == 0x8086 && (pci)->device == 0x5a98) + +diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c +index f2ef75c8de427..2cf6600c9ca83 100644 +--- a/sound/pci/hda/patch_realtek.c ++++ b/sound/pci/hda/patch_realtek.c +@@ -9091,6 +9091,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { + SND_PCI_QUIRK(0x144d, 0xc830, "Samsung Galaxy Book Ion (NT950XCJ-X716A)", ALC298_FIXUP_SAMSUNG_AMP), + SND_PCI_QUIRK(0x144d, 0xc832, "Samsung Galaxy Book Flex Alpha (NP730QCJ)", ALC256_FIXUP_SAMSUNG_HEADPHONE_VERY_QUIET), + SND_PCI_QUIRK(0x144d, 0xca03, "Samsung Galaxy Book2 Pro 360 (NP930QED)", ALC298_FIXUP_SAMSUNG_AMP), ++ SND_PCI_QUIRK(0x144d, 0xc868, "Samsung Galaxy Book2 Pro (NP930XED)", ALC298_FIXUP_SAMSUNG_AMP), + SND_PCI_QUIRK(0x1458, 0xfa53, "Gigabyte BXBT-2807", ALC283_FIXUP_HEADSET_MIC), + SND_PCI_QUIRK(0x1462, 0xb120, "MSI Cubi MS-B120", ALC283_FIXUP_HEADSET_MIC), + SND_PCI_QUIRK(0x1462, 0xb171, "Cubi N 8GL (MS-B171)", ALC283_FIXUP_HEADSET_MIC), +diff --git a/tools/testing/selftests/net/devlink_port_split.py 
b/tools/testing/selftests/net/devlink_port_split.py +index 834066d465fc1..f0fbd7367f4f6 100755 +--- a/tools/testing/selftests/net/devlink_port_split.py ++++ b/tools/testing/selftests/net/devlink_port_split.py +@@ -57,6 +57,8 @@ class devlink_ports(object): + assert stderr == "" + ports = json.loads(stdout)['port'] + ++ validate_devlink_output(ports, 'flavour') ++ + for port in ports: + if dev in port: + if ports[port]['flavour'] == 'physical': +@@ -218,6 +220,27 @@ def split_splittable_port(port, k, lanes, dev): + unsplit(port.bus_info) + + ++def validate_devlink_output(devlink_data, target_property=None): ++ """ ++ Determine if test should be skipped by checking: ++ 1. devlink_data contains values ++ 2. The target_property exist in devlink_data ++ """ ++ skip_reason = None ++ if any(devlink_data.values()): ++ if target_property: ++ skip_reason = "{} not found in devlink output, test skipped".format(target_property) ++ for key in devlink_data: ++ if target_property in devlink_data[key]: ++ skip_reason = None ++ else: ++ skip_reason = 'devlink output is empty, test skipped' ++ ++ if skip_reason: ++ print(skip_reason) ++ sys.exit(KSFT_SKIP) ++ ++ + def make_parser(): + parser = argparse.ArgumentParser(description='A test for port splitting.') + parser.add_argument('--dev', +@@ -238,6 +261,7 @@ def main(cmdline=None): + stdout, stderr = run_command(cmd) + assert stderr == "" + ++ validate_devlink_output(json.loads(stdout)) + devs = json.loads(stdout)['dev'] + dev = list(devs.keys())[0] + +@@ -249,6 +273,7 @@ def main(cmdline=None): + + ports = devlink_ports(dev) + ++ found_max_lanes = False + for port in ports.if_names: + max_lanes = get_max_lanes(port.name) + +@@ -271,6 +296,11 @@ def main(cmdline=None): + split_splittable_port(port, lane, max_lanes, dev) + + lane //= 2 ++ found_max_lanes = True ++ ++ if not found_max_lanes: ++ print(f"Test not started, no port of device {dev} reports max_lanes") ++ sys.exit(KSFT_SKIP) + + + if __name__ == "__main__": |