diff --git a/Documentation/block/null_blk.rst b/Documentation/block/null_blk.rst
index edbbab2f12f8..4dd78f24d10a 100644
--- a/Documentation/block/null_blk.rst
+++ b/Documentation/block/null_blk.rst
@@ -72,6 +72,28 @@ submit_queues=[1..nr_cpus]: Default: 1
 hw_queue_depth=[0..qdepth]: Default: 64
   The hardware queue depth of the device.
 
+memory_backed=[0/1]: Default: 0
+  Whether or not to use a memory buffer to respond to IO requests
+
+  =  =============================================
+  0  Transfer no data in response to IO requests
+  1  Use a memory buffer to respond to IO requests
+  =  =============================================
+
+discard=[0/1]: Default: 0
+  Support discard operations (requires memory-backed null_blk device).
+
+  =  =====================================
+  0  Do not support discard operations
+  1  Enable support for discard operations
+  =  =====================================
+
+cache_size=[Size in MB]: Default: 0
+  Cache size in MB for memory-backed device.
+
+mbps=[Maximum bandwidth in MB/s]: Default: 0 (no limit)
+  Bandwidth limit for device performance.
+
 Multi-queue specific parameters
 -------------------------------
 
diff --git a/Documentation/filesystems/ext4/blockmap.rst b/Documentation/filesystems/ext4/blockmap.rst
index 2bd990402a5c..cc596541ce79 100644
--- a/Documentation/filesystems/ext4/blockmap.rst
+++ b/Documentation/filesystems/ext4/blockmap.rst
@@ -1,7 +1,7 @@
 .. SPDX-License-Identifier: GPL-2.0
 
 +---------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| i.i_block Offset   | Where It Points                                                                                                                                                                                                              |
+| i.i_block Offset    | Where It Points                                                                                                                                                                                                              |
 +=====================+==============================================================================================================================================================================================================================+
 | 0 to 11             | Direct map to file blocks 0 to 11.                                                                                                                                                                                           |
 +---------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
diff --git a/Documentation/filesystems/xfs-delayed-logging-design.rst b/Documentation/filesystems/xfs-delayed-logging-design.rst
index 464405d2801e..4ef419f54663 100644
--- a/Documentation/filesystems/xfs-delayed-logging-design.rst
+++ b/Documentation/filesystems/xfs-delayed-logging-design.rst
@@ -1,29 +1,314 @@
 .. SPDX-License-Identifier: GPL-2.0
 
-==========================
-XFS Delayed Logging Design
-==========================
+==================
+XFS Logging Design
+==================
 
-Introduction to Re-logging in XFS
-=================================
+Preamble
+========
 
-XFS logging is a combination of logical and physical logging. Some objects,
-such as inodes and dquots, are logged in logical format where the details
-logged are made up of the changes to in-core structures rather than on-disk
-structures. Other objects - typically buffers - have their physical changes
-logged. The reason for these differences is to reduce the amount of log space
-required for objects that are frequently logged. Some parts of inodes are more
-frequently logged than others, and inodes are typically more frequently logged
-than any other object (except maybe the superblock buffer) so keeping the
-amount of metadata logged low is of prime importance.
+This document describes the design and algorithms that the XFS journalling
+subsystem is based on. This document describes the design and algorithms that
+the XFS journalling subsystem is based on so that readers may familiarize
+themselves with the general concepts of how transaction processing in XFS works.
 
-The reason that this is such a concern is that XFS allows multiple separate
-modifications to a single object to be carried in the log at any given time.
-This allows the log to avoid needing to flush each change to disk before
-recording a new change to the object. XFS does this via a method called
-"re-logging". Conceptually, this is quite simple - all it requires is that any
-new change to the object is recorded with a *new copy* of all the existing
-changes in the new transaction that is written to the log.
+We begin with an overview of transactions in XFS, followed by describing how
+transaction reservations are structured and accounted, and then move into how we
+guarantee forwards progress for long running transactions with finite initial
+reservations bounds. At this point we need to explain how relogging works. With
+the basic concepts covered, the design of the delayed logging mechanism is
+documented.
+
+
+Introduction
+============
+
+XFS uses Write Ahead Logging for ensuring changes to the filesystem metadata
+are atomic and recoverable. For reasons of space and time efficiency, the
+logging mechanisms are varied and complex, combining intents, logical and
+physical logging mechanisms to provide the necessary recovery guarantees the
+filesystem requires.
+
+Some objects, such as inodes and dquots, are logged in logical format where the
+details logged are made up of the changes to in-core structures rather than
+on-disk structures. Other objects - typically buffers - have their physical
+changes logged. Long running atomic modifications have individual changes
+chained together by intents, ensuring that journal recovery can restart and
+finish an operation that was only partially done when the system stopped
+functioning.
+
+The reason for these differences is to keep the amount of log space and CPU time
+required to process objects being modified as small as possible and hence the
+logging overhead as low as possible. Some items are very frequently modified,
+and some parts of objects are more frequently modified than others, so keeping
+the overhead of metadata logging low is of prime importance.
+
+The method used to log an item or chain modifications together isn't
+particularly important in the scope of this document. It suffices to know that
+the method used for logging a particular object or chaining modifications
+together are different and are dependent on the object and/or modification being
+performed. The logging subsystem only cares that certain specific rules are
+followed to guarantee forwards progress and prevent deadlocks.
+
+
+Transactions in XFS
+===================
+
+XFS has two types of high level transactions, defined by the type of log space
+reservation they take. These are known as "one shot" and "permanent"
+transactions. Permanent transaction reservations can take reservations that span
+commit boundaries, whilst "one shot" transactions are for a single atomic
+modification.
+
+The type and size of reservation must be matched to the modification taking
+place.  This means that permanent transactions can be used for one-shot
+modifications, but one-shot reservations cannot be used for permanent
+transactions.
+
+In the code, a one-shot transaction pattern looks somewhat like this::
+
+	tp = xfs_trans_alloc(<reservation>)
+	<lock items>
+	<join item to transaction>
+	<do modification>
+	xfs_trans_commit(tp);
+
+As items are modified in the transaction, the dirty regions in those items are
+tracked via the transaction handle.  Once the transaction is committed, all
+resources joined to it are released, along with the remaining unused reservation
+space that was taken at the transaction allocation time.
+
+In contrast, a permanent transaction is made up of multiple linked individual
+transactions, and the pattern looks like this::
+
+	tp = xfs_trans_alloc(<reservation>)
+	xfs_ilock(ip, XFS_ILOCK_EXCL)
+
+	loop {
+		xfs_trans_ijoin(tp, 0);
+		<do modification>
+		xfs_trans_log_inode(tp, ip);
+		xfs_trans_roll(&tp);
+	}
+
+	xfs_trans_commit(tp);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+While this might look similar to a one-shot transaction, there is an important
+difference: xfs_trans_roll() performs a specific operation that links two
+transactions together::
+
+	ntp = xfs_trans_dup(tp);
+	xfs_trans_commit(tp);
+	xfs_log_reserve(ntp);
+
+This results in a series of "rolling transactions" where the inode is locked
+across the entire chain of transactions.  Hence while this series of rolling
+transactions is running, nothing else can read from or write to the inode and
+this provides a mechanism for complex changes to appear atomic from an external
+observer's point of view.
+
+It is important to note that a series of rolling transactions in a permanent
+transaction does not form an atomic change in the journal. While each
+individual modification is atomic, the chain is *not atomic*. If we crash half
+way through, then recovery will only replay up to the last transactional
+modification the loop made that was committed to the journal.
+
+This affects long running permanent transactions in that it is not possible to
+predict how much of a long running operation will actually be recovered because
+there is no guarantee of how much of the operation reached stale storage. Hence
+if a long running operation requires multiple transactions to fully complete,
+the high level operation must use intents and deferred operations to guarantee
+recovery can complete the operation once the first transactions is persisted in
+the on-disk journal.
+
+
+Transactions are Asynchronous
+=============================
+
+In XFS, all high level transactions are asynchronous by default. This means that
+xfs_trans_commit() does not guarantee that the modification has been committed
+to stable storage when it returns. Hence when a system crashes, not all the
+completed transactions will be replayed during recovery.
+
+However, the logging subsystem does provide global ordering guarantees, such
+that if a specific change is seen after recovery, all metadata modifications
+that were committed prior to that change will also be seen.
+
+For single shot operations that need to reach stable storage immediately, or
+ensuring that a long running permanent transaction is fully committed once it is
+complete, we can explicitly tag a transaction as synchronous. This will trigger
+a "log force" to flush the outstanding committed transactions to stable storage
+in the journal and wait for that to complete.
+
+Synchronous transactions are rarely used, however, because they limit logging
+throughput to the IO latency limitations of the underlying storage. Instead, we
+tend to use log forces to ensure modifications are on stable storage only when
+a user operation requires a synchronisation point to occur (e.g. fsync).
+
+
+Transaction Reservations
+========================
+
+It has been mentioned a number of times now that the logging subsystem needs to
+provide a forwards progress guarantee so that no modification ever stalls
+because it can't be written to the journal due to a lack of space in the
+journal. This is achieved by the transaction reservations that are made when
+a transaction is first allocated. For permanent transactions, these reservations
+are maintained as part of the transaction rolling mechanism.
+
+A transaction reservation provides a guarantee that there is physical log space
+available to write the modification into the journal before we start making
+modifications to objects and items. As such, the reservation needs to be large
+enough to take into account the amount of metadata that the change might need to
+log in the worst case. This means that if we are modifying a btree in the
+transaction, we have to reserve enough space to record a full leaf-to-root split
+of the btree. As such, the reservations are quite complex because we have to
+take into account all the hidden changes that might occur.
+
+For example, a user data extent allocation involves allocating an extent from
+free space, which modifies the free space trees. That's two btrees.  Inserting
+the extent into the inode's extent map might require a split of the extent map
+btree, which requires another allocation that can modify the free space trees
+again.  Then we might have to update reverse mappings, which modifies yet
+another btree which might require more space. And so on.  Hence the amount of
+metadata that a "simple" operation can modify can be quite large.
+
+This "worst case" calculation provides us with the static "unit reservation"
+for the transaction that is calculated at mount time. We must guarantee that the
+log has this much space available before the transaction is allowed to proceed
+so that when we come to write the dirty metadata into the log we don't run out
+of log space half way through the write.
+
+For one-shot transactions, a single unit space reservation is all that is
+required for the transaction to proceed. For permanent transactions, however, we
+also have a "log count" that affects the size of the reservation that is to be
+made.
+
+While a permanent transaction can get by with a single unit of space
+reservation, it is somewhat inefficient to do this as it requires the
+transaction rolling mechanism to re-reserve space on every transaction roll. We
+know from the implementation of the permanent transactions how many transaction
+rolls are likely for the common modifications that need to be made.
+
+For example, and inode allocation is typically two transactions - one to
+physically allocate a free inode chunk on disk, and another to allocate an inode
+from an inode chunk that has free inodes in it.  Hence for an inode allocation
+transaction, we might set the reservation log count to a value of 2 to indicate
+that the common/fast path transaction will commit two linked transactions in a
+chain. Each time a permanent transaction rolls, it consumes an entire unit
+reservation.
+
+Hence when the permanent transaction is first allocated, the log space
+reservation is increases from a single unit reservation to multiple unit
+reservations. That multiple is defined by the reservation log count, and this
+means we can roll the transaction multiple times before we have to re-reserve
+log space when we roll the transaction. This ensures that the common
+modifications we make only need to reserve log space once.
+
+If the log count for a permanent transaction reaches zero, then it needs to
+re-reserve physical space in the log. This is somewhat complex, and requires
+an understanding of how the log accounts for space that has been reserved.
+
+
+Log Space Accounting
+====================
+
+The position in the log is typically referred to as a Log Sequence Number (LSN).
+The log is circular, so the positions in the log are defined by the combination
+of a cycle number - the number of times the log has been overwritten - and the
+offset into the log.  A LSN carries the cycle in the upper 32 bits and the
+offset in the lower 32 bits. The offset is in units of "basic blocks" (512
+bytes). Hence we can do realtively simple LSN based math to keep track of
+available space in the log.
+
+Log space accounting is done via a pair of constructs called "grant heads".  The
+position of the grant heads is an absolute value, so the amount of space
+available in the log is defined by the distance between the position of the
+grant head and the current log tail. That is, how much space can be
+reserved/consumed before the grant heads would fully wrap the log and overtake
+the tail position.
+
+The first grant head is the "reserve" head. This tracks the byte count of the
+reservations currently held by active transactions. It is a purely in-memory
+accounting of the space reservation and, as such, actually tracks byte offsets
+into the log rather than basic blocks. Hence it technically isn't using LSNs to
+represent the log position, but it is still treated like a split {cycle,offset}
+tuple for the purposes of tracking reservation space.
+
+The reserve grant head is used to accurately account for exact transaction
+reservations amounts and the exact byte count that modifications actually make
+and need to write into the log. The reserve head is used to prevent new
+transactions from taking new reservations when the head reaches the current
+tail. It will block new reservations in a FIFO queue and as the log tail moves
+forward it will wake them in order once sufficient space is available. This FIFO
+mechanism ensures no transaction is starved of resources when log space
+shortages occur.
+
+The other grant head is the "write" head. Unlike the reserve head, this grant
+head contains an LSN and it tracks the physical space usage in the log. While
+this might sound like it is accounting the same state as the reserve grant head
+- and it mostly does track exactly the same location as the reserve grant head -
+there are critical differences in behaviour between them that provides the
+forwards progress guarantees that rolling permanent transactions require.
+
+These differences when a permanent transaction is rolled and the internal "log
+count" reaches zero and the initial set of unit reservations have been
+exhausted. At this point, we still require a log space reservation to continue
+the next transaction in the sequeunce, but we have none remaining. We cannot
+sleep during the transaction commit process waiting for new log space to become
+available, as we may end up on the end of the FIFO queue and the items we have
+locked while we sleep could end up pinning the tail of the log before there is
+enough free space in the log to fulfil all of the pending reservations and
+then wake up transaction commit in progress.
+
+To take a new reservation without sleeping requires us to be able to take a
+reservation even if there is no reservation space currently available. That is,
+we need to be able to *overcommit* the log reservation space. As has already
+been detailed, we cannot overcommit physical log space. However, the reserve
+grant head does not track physical space - it only accounts for the amount of
+reservations we currently have outstanding. Hence if the reserve head passes
+over the tail of the log all it means is that new reservations will be throttled
+immediately and remain throttled until the log tail is moved forward far enough
+to remove the overcommit and start taking new reservations. In other words, we
+can overcommit the reserve head without violating the physical log head and tail
+rules.
+
+As a result, permanent transactions only "regrant" reservation space during
+xfs_trans_commit() calls, while the physical log space reservation - tracked by
+the write head - is then reserved separately by a call to xfs_log_reserve()
+after the commit completes. Once the commit completes, we can sleep waiting for
+physical log space to be reserved from the write grant head, but only if one
+critical rule has been observed::
+
+	Code using permanent reservations must always log the items they hold
+	locked across each transaction they roll in the chain.
+
+"Re-logging" the locked items on every transaction roll ensures that the items
+attached to the transaction chain being rolled are always relocated to the
+physical head of the log and so do not pin the tail of the log. If a locked item
+pins the tail of the log when we sleep on the write reservation, then we will
+deadlock the log as we cannot take the locks needed to write back that item and
+move the tail of the log forwards to free up write grant space. Re-logging the
+locked items avoids this deadlock and guarantees that the log reservation we are
+making cannot self-deadlock.
+
+If all rolling transactions obey this rule, then they can all make forwards
+progress independently because nothing will block the progress of the log
+tail moving forwards and hence ensuring that write grant space is always
+(eventually) made available to permanent transactions no matter how many times
+they roll.
+
+
+Re-logging Explained
+====================
+
+XFS allows multiple separate modifications to a single object to be carried in
+the log at any given time.  This allows the log to avoid needing to flush each
+change to disk before recording a new change to the object. XFS does this via a
+method called "re-logging". Conceptually, this is quite simple - all it requires
+is that any new change to the object is recorded with a *new copy* of all the
+existing changes in the new transaction that is written to the log.
 
 That is, if we have a sequence of changes A through to F, and the object was
 written to disk after change D, we would see in the log the following series
@@ -42,16 +327,13 @@ transaction::
 In other words, each time an object is relogged, the new transaction contains
 the aggregation of all the previous changes currently held only in the log.
 
-This relogging technique also allows objects to be moved forward in the log so
-that an object being relogged does not prevent the tail of the log from ever
-moving forward.  This can be seen in the table above by the changing
-(increasing) LSN of each subsequent transaction - the LSN is effectively a
-direct encoding of the location in the log of the transaction.
+This relogging technique allows objects to be moved forward in the log so that
+an object being relogged does not prevent the tail of the log from ever moving
+forward.  This can be seen in the table above by the changing (increasing) LSN
+of each subsequent transaction, and it's the technique that allows us to
+implement long-running, multiple-commit permanent transactions. 
 
-This relogging is also used to implement long-running, multiple-commit
-transactions.  These transaction are known as rolling transactions, and require
-a special log reservation known as a permanent transaction reservation. A
-typical example of a rolling transaction is the removal of extents from an
+A typical example of a rolling transaction is the removal of extents from an
 inode which can only be done at a rate of two extents per transaction because
 of reservation size limitations. Hence a rolling extent removal transaction
 keeps relogging the inode and btree buffers as they get modified in each
@@ -67,12 +349,13 @@ the log over and over again. Worse is the fact that objects tend to get
 dirtier as they get relogged, so each subsequent transaction is writing more
 metadata into the log.
 
-Another feature of the XFS transaction subsystem is that most transactions are
-asynchronous. That is, they don't commit to disk until either a log buffer is
-filled (a log buffer can hold multiple transactions) or a synchronous operation
-forces the log buffers holding the transactions to disk. This means that XFS is
-doing aggregation of transactions in memory - batching them, if you like - to
-minimise the impact of the log IO on transaction throughput.
+It should now also be obvious how relogging and asynchronous transactions go
+hand in hand. That is, transactions don't get written to the physical journal
+until either a log buffer is filled (a log buffer can hold multiple
+transactions) or a synchronous operation forces the log buffers holding the
+transactions to disk. This means that XFS is doing aggregation of transactions
+in memory - batching them, if you like - to minimise the impact of the log IO on
+transaction throughput.
 
 The limitation on asynchronous transaction throughput is the number and size of
 log buffers made available by the log manager. By default there are 8 log
diff --git a/MAINTAINERS b/MAINTAINERS
index 840167074402..ba213b20508f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -736,6 +736,14 @@ S:	Maintained
 F:	Documentation/i2c/busses/i2c-ali1563.rst
 F:	drivers/i2c/busses/i2c-ali1563.c
 
+ALIBABA ELASTIC RDMA DRIVER
+M:	Cheng Xu <chengyou@linux.alibaba.com>
+M:	Kai Shen <kaishen@linux.alibaba.com>
+L:	linux-rdma@vger.kernel.org
+S:	Supported
+F:	drivers/infiniband/hw/erdma
+F:	include/uapi/rdma/erdma-abi.h
+
 ALIENWARE WMI DRIVER
 L:	Dell.Client.Kernel@dell.com
 S:	Maintained
@@ -14506,7 +14514,8 @@ S:	Supported
 W:	http://git.infradead.org/nvme.git
 T:	git://git.infradead.org/nvme.git
 F:	drivers/nvme/host/
-F:	include/linux/nvme.h
+F:	drivers/nvme/common/
+F:	include/linux/nvme*
 F:	include/uapi/linux/nvme_ioctl.h
 
 NVM EXPRESS FC TRANSPORT DRIVERS
@@ -18837,6 +18846,7 @@ SOFTWARE RAID (Multiple Disks) SUPPORT
 M:	Song Liu <song@kernel.org>
 L:	linux-raid@vger.kernel.org
 S:	Supported
+Q:	https://patchwork.kernel.org/project/linux-raid/list/
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/song/md.git
 F:	drivers/md/Kconfig
 F:	drivers/md/Makefile
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 32929c89ba8a..3f5685c00e36 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -134,7 +134,7 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
 	iv = bip->bip_vec + bip->bip_vcnt;
 
 	if (bip->bip_vcnt &&
-	    bvec_gap_to_prev(bdev_get_queue(bio->bi_bdev),
+	    bvec_gap_to_prev(&bdev_get_queue(bio->bi_bdev)->limits,
 			     &bip->bip_vec[bip->bip_vcnt - 1], offset))
 		return 0;
 
diff --git a/block/bio.c b/block/bio.c
index 9a14f37c609f..96bca5b8cc92 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -968,7 +968,7 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio,
 		 * would create a gap, disallow it.
 		 */
 		bvec = &bio->bi_io_vec[bio->bi_vcnt - 1];
-		if (bvec_gap_to_prev(q, bvec, offset))
+		if (bvec_gap_to_prev(&q->limits, bvec, offset))
 			return 0;
 	}
 
@@ -1154,22 +1154,12 @@ void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
 	bio_set_flag(bio, BIO_CLONED);
 }
 
-static void bio_put_pages(struct page **pages, size_t size, size_t off)
-{
-	size_t i, nr = DIV_ROUND_UP(size + (off & ~PAGE_MASK), PAGE_SIZE);
-
-	for (i = 0; i < nr; i++)
-		put_page(pages[i]);
-}
-
 static int bio_iov_add_page(struct bio *bio, struct page *page,
 		unsigned int len, unsigned int offset)
 {
 	bool same_page = false;
 
 	if (!__bio_try_merge_page(bio, page, len, offset, &same_page)) {
-		if (WARN_ON_ONCE(bio_full(bio, len)))
-			return -EINVAL;
 		__bio_add_page(bio, page, len, offset);
 		return 0;
 	}
@@ -1212,8 +1202,9 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 	struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
 	struct page **pages = (struct page **)bv;
 	ssize_t size, left;
-	unsigned len, i;
+	unsigned len, i = 0;
 	size_t offset;
+	int ret = 0;
 
 	/*
 	 * Move page array up in the allocated memory for the bio vecs as far as
@@ -1230,32 +1221,40 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 	 * result to ensure the bio's total size is correct. The remainder of
 	 * the iov data will be picked up in the next bio iteration.
 	 */
-	size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
-	if (size > 0)
+	size = iov_iter_get_pages(iter, pages, UINT_MAX - bio->bi_iter.bi_size,
+				  nr_pages, &offset);
+	if (size > 0) {
+		nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE);
 		size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev));
-	if (unlikely(size <= 0))
-		return size ? size : -EFAULT;
+	} else
+		nr_pages = 0;
+
+	if (unlikely(size <= 0)) {
+		ret = size ? size : -EFAULT;
+		goto out;
+	}
 
 	for (left = size, i = 0; left > 0; left -= len, i++) {
 		struct page *page = pages[i];
-		int ret;
 
 		len = min_t(size_t, PAGE_SIZE - offset, left);
-		if (bio_op(bio) == REQ_OP_ZONE_APPEND)
+		if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
 			ret = bio_iov_add_zone_append_page(bio, page, len,
 					offset);
-		else
-			ret = bio_iov_add_page(bio, page, len, offset);
+			if (ret)
+				break;
+		} else
+			bio_iov_add_page(bio, page, len, offset);
 
-		if (ret) {
-			bio_put_pages(pages + i, left, offset);
-			return ret;
-		}
 		offset = 0;
 	}
 
-	iov_iter_advance(iter, size);
-	return 0;
+	iov_iter_advance(iter, size - left);
+out:
+	while (i < nr_pages)
+		put_page(pages[i++]);
+
+	return ret;
 }
 
 /**
diff --git a/block/blk-core.c b/block/blk-core.c
index 3d286a256d3d..a0d1104c5590 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -377,7 +377,6 @@ static void blk_timeout_work(struct work_struct *work)
 struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu)
 {
 	struct request_queue *q;
-	int ret;
 
 	q = kmem_cache_alloc_node(blk_get_queue_kmem_cache(alloc_srcu),
 			GFP_KERNEL | __GFP_ZERO, node_id);
@@ -396,13 +395,9 @@ struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu)
 	if (q->id < 0)
 		goto fail_srcu;
 
-	ret = bioset_init(&q->bio_split, BIO_POOL_SIZE, 0, 0);
-	if (ret)
-		goto fail_id;
-
 	q->stats = blk_alloc_queue_stats();
 	if (!q->stats)
-		goto fail_split;
+		goto fail_id;
 
 	q->node = node_id;
 
@@ -439,8 +434,6 @@ struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu)
 
 fail_stats:
 	blk_free_queue_stats(q->stats);
-fail_split:
-	bioset_exit(&q->bio_split);
 fail_id:
 	ida_free(&blk_queue_ida, q->id);
 fail_srcu:
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 4c8a699754c9..ff04e9290715 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -82,7 +82,7 @@ static inline bool bio_will_gap(struct request_queue *q,
 	bio_get_first_bvec(next, &nb);
 	if (biovec_phys_mergeable(q, &pb, &nb))
 		return false;
-	return __bvec_gap_to_prev(q, &pb, nb.bv_offset);
+	return __bvec_gap_to_prev(&q->limits, &pb, nb.bv_offset);
 }
 
 static inline bool req_gap_back_merge(struct request *req, struct bio *bio)
@@ -95,23 +95,30 @@ static inline bool req_gap_front_merge(struct request *req, struct bio *bio)
 	return bio_will_gap(req->q, NULL, bio, req->bio);
 }
 
-static struct bio *blk_bio_discard_split(struct request_queue *q,
-					 struct bio *bio,
-					 struct bio_set *bs,
-					 unsigned *nsegs)
+/*
+ * The max size one bio can handle is UINT_MAX becasue bvec_iter.bi_size
+ * is defined as 'unsigned int', meantime it has to be aligned to with the
+ * logical block size, which is the minimum accepted unit by hardware.
+ */
+static unsigned int bio_allowed_max_sectors(struct queue_limits *lim)
+{
+	return round_down(UINT_MAX, lim->logical_block_size) >> SECTOR_SHIFT;
+}
+
+static struct bio *bio_split_discard(struct bio *bio, struct queue_limits *lim,
+		unsigned *nsegs, struct bio_set *bs)
 {
 	unsigned int max_discard_sectors, granularity;
-	int alignment;
 	sector_t tmp;
 	unsigned split_sectors;
 
 	*nsegs = 1;
 
 	/* Zero-sector (unknown) and one-sector granularities are the same.  */
-	granularity = max(q->limits.discard_granularity >> 9, 1U);
+	granularity = max(lim->discard_granularity >> 9, 1U);
 
-	max_discard_sectors = min(q->limits.max_discard_sectors,
-			bio_allowed_max_sectors(q));
+	max_discard_sectors =
+		min(lim->max_discard_sectors, bio_allowed_max_sectors(lim));
 	max_discard_sectors -= max_discard_sectors % granularity;
 
 	if (unlikely(!max_discard_sectors)) {
@@ -128,9 +135,8 @@ static struct bio *blk_bio_discard_split(struct request_queue *q,
 	 * If the next starting sector would be misaligned, stop the discard at
 	 * the previous aligned sector.
 	 */
-	alignment = (q->limits.discard_alignment >> 9) % granularity;
-
-	tmp = bio->bi_iter.bi_sector + split_sectors - alignment;
+	tmp = bio->bi_iter.bi_sector + split_sectors -
+		((lim->discard_alignment >> 9) % granularity);
 	tmp = sector_div(tmp, granularity);
 
 	if (split_sectors > tmp)
@@ -139,18 +145,15 @@ static struct bio *blk_bio_discard_split(struct request_queue *q,
 	return bio_split(bio, split_sectors, GFP_NOIO, bs);
 }
 
-static struct bio *blk_bio_write_zeroes_split(struct request_queue *q,
-		struct bio *bio, struct bio_set *bs, unsigned *nsegs)
+static struct bio *bio_split_write_zeroes(struct bio *bio,
+		struct queue_limits *lim, unsigned *nsegs, struct bio_set *bs)
 {
 	*nsegs = 0;
-
-	if (!q->limits.max_write_zeroes_sectors)
+	if (!lim->max_write_zeroes_sectors)
 		return NULL;
-
-	if (bio_sectors(bio) <= q->limits.max_write_zeroes_sectors)
+	if (bio_sectors(bio) <= lim->max_write_zeroes_sectors)
 		return NULL;
-
-	return bio_split(bio, q->limits.max_write_zeroes_sectors, GFP_NOIO, bs);
+	return bio_split(bio, lim->max_write_zeroes_sectors, GFP_NOIO, bs);
 }
 
 /*
@@ -161,17 +164,17 @@ static struct bio *blk_bio_write_zeroes_split(struct request_queue *q,
  * requests that are submitted to a block device if the start of a bio is not
  * aligned to a physical block boundary.
  */
-static inline unsigned get_max_io_size(struct request_queue *q,
-				       struct bio *bio)
+static inline unsigned get_max_io_size(struct bio *bio,
+		struct queue_limits *lim)
 {
-	unsigned pbs = queue_physical_block_size(q) >> SECTOR_SHIFT;
-	unsigned lbs = queue_logical_block_size(q) >> SECTOR_SHIFT;
-	unsigned max_sectors = queue_max_sectors(q), start, end;
+	unsigned pbs = lim->physical_block_size >> SECTOR_SHIFT;
+	unsigned lbs = lim->logical_block_size >> SECTOR_SHIFT;
+	unsigned max_sectors = lim->max_sectors, start, end;
 
-	if (q->limits.chunk_sectors) {
+	if (lim->chunk_sectors) {
 		max_sectors = min(max_sectors,
 			blk_chunk_sectors_left(bio->bi_iter.bi_sector,
-					       q->limits.chunk_sectors));
+					       lim->chunk_sectors));
 	}
 
 	start = bio->bi_iter.bi_sector & (pbs - 1);
@@ -181,11 +184,10 @@ static inline unsigned get_max_io_size(struct request_queue *q,
 	return max_sectors & ~(lbs - 1);
 }
 
-static inline unsigned get_max_segment_size(const struct request_queue *q,
-					    struct page *start_page,
-					    unsigned long offset)
+static inline unsigned get_max_segment_size(struct queue_limits *lim,
+		struct page *start_page, unsigned long offset)
 {
-	unsigned long mask = queue_segment_boundary(q);
+	unsigned long mask = lim->seg_boundary_mask;
 
 	offset = mask & (page_to_phys(start_page) + offset);
 
@@ -194,12 +196,12 @@ static inline unsigned get_max_segment_size(const struct request_queue *q,
 	 * on 32bit arch, use queue's max segment size when that happens.
 	 */
 	return min_not_zero(mask - offset + 1,
-			(unsigned long)queue_max_segment_size(q));
+			(unsigned long)lim->max_segment_size);
 }
 
 /**
  * bvec_split_segs - verify whether or not a bvec should be split in the middle
- * @q:        [in] request queue associated with the bio associated with @bv
+ * @lim:      [in] queue limits to split based on
  * @bv:       [in] bvec to examine
  * @nsegs:    [in,out] Number of segments in the bio being built. Incremented
  *            by the number of segments from @bv that may be appended to that
@@ -217,10 +219,9 @@ static inline unsigned get_max_segment_size(const struct request_queue *q,
  * *@nsegs segments and *@sectors sectors would make that bio unacceptable for
  * the block driver.
  */
-static bool bvec_split_segs(const struct request_queue *q,
-			    const struct bio_vec *bv, unsigned *nsegs,
-			    unsigned *bytes, unsigned max_segs,
-			    unsigned max_bytes)
+static bool bvec_split_segs(struct queue_limits *lim, const struct bio_vec *bv,
+		unsigned *nsegs, unsigned *bytes, unsigned max_segs,
+		unsigned max_bytes)
 {
 	unsigned max_len = min(max_bytes, UINT_MAX) - *bytes;
 	unsigned len = min(bv->bv_len, max_len);
@@ -228,7 +229,7 @@ static bool bvec_split_segs(const struct request_queue *q,
 	unsigned seg_size = 0;
 
 	while (len && *nsegs < max_segs) {
-		seg_size = get_max_segment_size(q, bv->bv_page,
+		seg_size = get_max_segment_size(lim, bv->bv_page,
 						bv->bv_offset + total_len);
 		seg_size = min(seg_size, len);
 
@@ -236,7 +237,7 @@ static bool bvec_split_segs(const struct request_queue *q,
 		total_len += seg_size;
 		len -= seg_size;
 
-		if ((bv->bv_offset + total_len) & queue_virt_boundary(q))
+		if ((bv->bv_offset + total_len) & lim->virt_boundary_mask)
 			break;
 	}
 
@@ -247,16 +248,17 @@ static bool bvec_split_segs(const struct request_queue *q,
 }
 
 /**
- * blk_bio_segment_split - split a bio in two bios
- * @q:    [in] request queue pointer
+ * bio_split_rw - split a bio in two bios
  * @bio:  [in] bio to be split
- * @bs:	  [in] bio set to allocate the clone from
+ * @lim:  [in] queue limits to split based on
  * @segs: [out] number of segments in the bio with the first half of the sectors
+ * @bs:	  [in] bio set to allocate the clone from
+ * @max_bytes: [in] maximum number of bytes per bio
  *
  * Clone @bio, update the bi_iter of the clone to represent the first sectors
  * of @bio and update @bio->bi_iter to represent the remaining sectors. The
  * following is guaranteed for the cloned bio:
- * - That it has at most get_max_io_size(@q, @bio) sectors.
+ * - That it has at most @max_bytes worth of data
  * - That it has at most queue_max_segments(@q) segments.
  *
  * Except for discard requests the cloned bio will point at the bi_io_vec of
@@ -265,33 +267,30 @@ static bool bvec_split_segs(const struct request_queue *q,
  * responsible for ensuring that @bs is only destroyed after processing of the
  * split bio has finished.
  */
-static struct bio *blk_bio_segment_split(struct request_queue *q,
-					 struct bio *bio,
-					 struct bio_set *bs,
-					 unsigned *segs)
+static struct bio *bio_split_rw(struct bio *bio, struct queue_limits *lim,
+		unsigned *segs, struct bio_set *bs, unsigned max_bytes)
 {
 	struct bio_vec bv, bvprv, *bvprvp = NULL;
 	struct bvec_iter iter;
 	unsigned nsegs = 0, bytes = 0;
-	const unsigned max_bytes = get_max_io_size(q, bio) << 9;
-	const unsigned max_segs = queue_max_segments(q);
 
 	bio_for_each_bvec(bv, bio, iter) {
 		/*
 		 * If the queue doesn't support SG gaps and adding this
 		 * offset would create a gap, disallow it.
 		 */
-		if (bvprvp && bvec_gap_to_prev(q, bvprvp, bv.bv_offset))
+		if (bvprvp && bvec_gap_to_prev(lim, bvprvp, bv.bv_offset))
 			goto split;
 
-		if (nsegs < max_segs &&
+		if (nsegs < lim->max_segments &&
 		    bytes + bv.bv_len <= max_bytes &&
 		    bv.bv_offset + bv.bv_len <= PAGE_SIZE) {
 			nsegs++;
 			bytes += bv.bv_len;
-		} else if (bvec_split_segs(q, &bv, &nsegs, &bytes, max_segs,
-					   max_bytes)) {
-			goto split;
+		} else {
+			if (bvec_split_segs(lim, &bv, &nsegs, &bytes,
+					lim->max_segments, max_bytes))
+				goto split;
 		}
 
 		bvprv = bv;
@@ -308,7 +307,7 @@ split:
 	 * split size so that each bio is properly block size aligned, even if
 	 * we do not use the full hardware limits.
 	 */
-	bytes = ALIGN_DOWN(bytes, queue_logical_block_size(q));
+	bytes = ALIGN_DOWN(bytes, lim->logical_block_size);
 
 	/*
 	 * Bio splitting may cause subtle trouble such as hang when doing sync
@@ -320,34 +319,35 @@ split:
 }
 
 /**
- * __blk_queue_split - split a bio and submit the second half
- * @q:       [in] request_queue new bio is being queued at
- * @bio:     [in, out] bio to be split
- * @nr_segs: [out] number of segments in the first bio
+ * __bio_split_to_limits - split a bio to fit the queue limits
+ * @bio:     bio to be split
+ * @lim:     queue limits to split based on
+ * @nr_segs: returns the number of segments in the returned bio
  *
- * Split a bio into two bios, chain the two bios, submit the second half and
- * store a pointer to the first half in *@bio. If the second bio is still too
- * big it will be split by a recursive call to this function. Since this
- * function may allocate a new bio from q->bio_split, it is the responsibility
- * of the caller to ensure that q->bio_split is only released after processing
- * of the split bio has finished.
+ * Check if @bio needs splitting based on the queue limits, and if so split off
+ * a bio fitting the limits from the beginning of @bio and return it.  @bio is
+ * shortened to the remainder and re-submitted.
+ *
+ * The split bio is allocated from @q->bio_split, which is provided by the
+ * block layer.
  */
-void __blk_queue_split(struct request_queue *q, struct bio **bio,
+struct bio *__bio_split_to_limits(struct bio *bio, struct queue_limits *lim,
 		       unsigned int *nr_segs)
 {
-	struct bio *split = NULL;
+	struct bio_set *bs = &bio->bi_bdev->bd_disk->bio_split;
+	struct bio *split;
 
-	switch (bio_op(*bio)) {
+	switch (bio_op(bio)) {
 	case REQ_OP_DISCARD:
 	case REQ_OP_SECURE_ERASE:
-		split = blk_bio_discard_split(q, *bio, &q->bio_split, nr_segs);
+		split = bio_split_discard(bio, lim, nr_segs, bs);
 		break;
 	case REQ_OP_WRITE_ZEROES:
-		split = blk_bio_write_zeroes_split(q, *bio, &q->bio_split,
-				nr_segs);
+		split = bio_split_write_zeroes(bio, lim, nr_segs, bs);
 		break;
 	default:
-		split = blk_bio_segment_split(q, *bio, &q->bio_split, nr_segs);
+		split = bio_split_rw(bio, lim, nr_segs, bs,
+				get_max_io_size(bio, lim) << SECTOR_SHIFT);
 		break;
 	}
 
@@ -356,32 +356,35 @@ void __blk_queue_split(struct request_queue *q, struct bio **bio,
 		split->bi_opf |= REQ_NOMERGE;
 
 		blkcg_bio_issue_init(split);
-		bio_chain(split, *bio);
-		trace_block_split(split, (*bio)->bi_iter.bi_sector);
-		submit_bio_noacct(*bio);
-		*bio = split;
+		bio_chain(split, bio);
+		trace_block_split(split, bio->bi_iter.bi_sector);
+		submit_bio_noacct(bio);
+		return split;
 	}
+	return bio;
 }
 
 /**
- * blk_queue_split - split a bio and submit the second half
- * @bio: [in, out] bio to be split
+ * bio_split_to_limits - split a bio to fit the queue limits
+ * @bio:     bio to be split
  *
- * Split a bio into two bios, chains the two bios, submit the second half and
- * store a pointer to the first half in *@bio. Since this function may allocate
- * a new bio from q->bio_split, it is the responsibility of the caller to ensure
- * that q->bio_split is only released after processing of the split bio has
- * finished.
+ * Check if @bio needs splitting based on the queue limits of @bio->bi_bdev, and
+ * if so split off a bio fitting the limits from the beginning of @bio and
+ * return it.  @bio is shortened to the remainder and re-submitted.
+ *
+ * The split bio is allocated from @q->bio_split, which is provided by the
+ * block layer.
  */
-void blk_queue_split(struct bio **bio)
+struct bio *bio_split_to_limits(struct bio *bio)
 {
-	struct request_queue *q = bdev_get_queue((*bio)->bi_bdev);
+	struct queue_limits *lim = &bdev_get_queue(bio->bi_bdev)->limits;
 	unsigned int nr_segs;
 
-	if (blk_may_split(q, *bio))
-		__blk_queue_split(q, bio, &nr_segs);
+	if (bio_may_exceed_limits(bio, lim))
+		return __bio_split_to_limits(bio, lim, &nr_segs);
+	return bio;
 }
-EXPORT_SYMBOL(blk_queue_split);
+EXPORT_SYMBOL(bio_split_to_limits);
 
 unsigned int blk_recalc_rq_segments(struct request *rq)
 {
@@ -411,7 +414,7 @@ unsigned int blk_recalc_rq_segments(struct request *rq)
 	}
 
 	rq_for_each_bvec(bv, rq, iter)
-		bvec_split_segs(rq->q, &bv, &nr_phys_segs, &bytes,
+		bvec_split_segs(&rq->q->limits, &bv, &nr_phys_segs, &bytes,
 				UINT_MAX, UINT_MAX);
 	return nr_phys_segs;
 }
@@ -442,8 +445,8 @@ static unsigned blk_bvec_map_sg(struct request_queue *q,
 
 	while (nbytes > 0) {
 		unsigned offset = bvec->bv_offset + total;
-		unsigned len = min(get_max_segment_size(q, bvec->bv_page,
-					offset), nbytes);
+		unsigned len = min(get_max_segment_size(&q->limits,
+				   bvec->bv_page, offset), nbytes);
 		struct page *page = bvec->bv_page;
 
 		/*
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 70177ee74295..5ee62b95f3e5 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2815,9 +2815,9 @@ void blk_mq_submit_bio(struct bio *bio)
 	unsigned int nr_segs = 1;
 	blk_status_t ret;
 
-	blk_queue_bounce(q, &bio);
-	if (blk_may_split(q, bio))
-		__blk_queue_split(q, &bio, &nr_segs);
+	bio = blk_queue_bounce(bio, q);
+	if (bio_may_exceed_limits(bio, &q->limits))
+		bio = __bio_split_to_limits(bio, &q->limits, &nr_segs);
 
 	if (!bio_integrity_prep(bio))
 		return;
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index c0303026752d..e1f009aba6fd 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -779,8 +779,6 @@ static void blk_release_queue(struct kobject *kobj)
 	if (queue_is_mq(q))
 		blk_mq_release(q);
 
-	bioset_exit(&q->bio_split);
-
 	if (blk_queue_has_srcu(q))
 		cleanup_srcu_struct(q->srcu);
 
diff --git a/block/blk.h b/block/blk.h
index 1d83b1d41cd1..d7142c4d2fef 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -97,23 +97,23 @@ static inline bool biovec_phys_mergeable(struct request_queue *q,
 	return true;
 }
 
-static inline bool __bvec_gap_to_prev(struct request_queue *q,
+static inline bool __bvec_gap_to_prev(struct queue_limits *lim,
 		struct bio_vec *bprv, unsigned int offset)
 {
-	return (offset & queue_virt_boundary(q)) ||
-		((bprv->bv_offset + bprv->bv_len) & queue_virt_boundary(q));
+	return (offset & lim->virt_boundary_mask) ||
+		((bprv->bv_offset + bprv->bv_len) & lim->virt_boundary_mask);
 }
 
 /*
  * Check if adding a bio_vec after bprv with offset would create a gap in
  * the SG list. Most drivers don't care about this, but some do.
  */
-static inline bool bvec_gap_to_prev(struct request_queue *q,
+static inline bool bvec_gap_to_prev(struct queue_limits *lim,
 		struct bio_vec *bprv, unsigned int offset)
 {
-	if (!queue_virt_boundary(q))
+	if (!lim->virt_boundary_mask)
 		return false;
-	return __bvec_gap_to_prev(q, bprv, offset);
+	return __bvec_gap_to_prev(lim, bprv, offset);
 }
 
 static inline bool rq_mergeable(struct request *rq)
@@ -189,7 +189,8 @@ static inline bool integrity_req_gap_back_merge(struct request *req,
 	struct bio_integrity_payload *bip = bio_integrity(req->bio);
 	struct bio_integrity_payload *bip_next = bio_integrity(next);
 
-	return bvec_gap_to_prev(req->q, &bip->bip_vec[bip->bip_vcnt - 1],
+	return bvec_gap_to_prev(&req->q->limits,
+				&bip->bip_vec[bip->bip_vcnt - 1],
 				bip_next->bip_vec[0].bv_offset);
 }
 
@@ -199,7 +200,8 @@ static inline bool integrity_req_gap_front_merge(struct request *req,
 	struct bio_integrity_payload *bip = bio_integrity(bio);
 	struct bio_integrity_payload *bip_next = bio_integrity(req->bio);
 
-	return bvec_gap_to_prev(req->q, &bip->bip_vec[bip->bip_vcnt - 1],
+	return bvec_gap_to_prev(&req->q->limits,
+				&bip->bip_vec[bip->bip_vcnt - 1],
 				bip_next->bip_vec[0].bv_offset);
 }
 
@@ -288,7 +290,8 @@ ssize_t part_timeout_show(struct device *, struct device_attribute *, char *);
 ssize_t part_timeout_store(struct device *, struct device_attribute *,
 				const char *, size_t);
 
-static inline bool blk_may_split(struct request_queue *q, struct bio *bio)
+static inline bool bio_may_exceed_limits(struct bio *bio,
+		struct queue_limits *lim)
 {
 	switch (bio_op(bio)) {
 	case REQ_OP_DISCARD:
@@ -307,12 +310,12 @@ static inline bool blk_may_split(struct request_queue *q, struct bio *bio)
 	 * to the performance impact of cloned bios themselves the loop below
 	 * doesn't matter anyway.
 	 */
-	return q->limits.chunk_sectors || bio->bi_vcnt != 1 ||
+	return lim->chunk_sectors || bio->bi_vcnt != 1 ||
 		bio->bi_io_vec->bv_len + bio->bi_io_vec->bv_offset > PAGE_SIZE;
 }
 
-void __blk_queue_split(struct request_queue *q, struct bio **bio,
-			unsigned int *nr_segs);
+struct bio *__bio_split_to_limits(struct bio *bio, struct queue_limits *lim,
+		       unsigned int *nr_segs);
 int ll_back_merge_fn(struct request *req, struct bio *bio,
 		unsigned int nr_segs);
 bool blk_attempt_req_merge(struct request_queue *q, struct request *rq,
@@ -344,16 +347,6 @@ static inline void req_set_nomerge(struct request_queue *q, struct request *req)
 		q->last_merge = NULL;
 }
 
-/*
- * The max size one bio can handle is UINT_MAX becasue bvec_iter.bi_size
- * is defined as 'unsigned int', meantime it has to aligned to with logical
- * block size which is the minimum accepted unit by hardware.
- */
-static inline unsigned int bio_allowed_max_sectors(struct request_queue *q)
-{
-	return round_down(UINT_MAX, queue_logical_block_size(q)) >> 9;
-}
-
 /*
  * Internal io_context interface
  */
@@ -378,7 +371,7 @@ static inline void blk_throtl_bio_endio(struct bio *bio) { }
 static inline void blk_throtl_stat_add(struct request *rq, u64 time) { }
 #endif
 
-void __blk_queue_bounce(struct request_queue *q, struct bio **bio);
+struct bio *__blk_queue_bounce(struct bio *bio, struct request_queue *q);
 
 static inline bool blk_queue_may_bounce(struct request_queue *q)
 {
@@ -387,10 +380,12 @@ static inline bool blk_queue_may_bounce(struct request_queue *q)
 		max_low_pfn >= max_pfn;
 }
 
-static inline void blk_queue_bounce(struct request_queue *q, struct bio **bio)
+static inline struct bio *blk_queue_bounce(struct bio *bio,
+		struct request_queue *q)
 {
-	if (unlikely(blk_queue_may_bounce(q) && bio_has_data(*bio)))
-		__blk_queue_bounce(q, bio);	
+	if (unlikely(blk_queue_may_bounce(q) && bio_has_data(bio)))
+		return __blk_queue_bounce(bio, q);
+	return bio;
 }
 
 #ifdef CONFIG_BLK_CGROUP_IOLATENCY
diff --git a/block/bounce.c b/block/bounce.c
index c8f487af7be3..7cfcb242f9a1 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -199,24 +199,24 @@ err_put:
 	return NULL;
 }
 
-void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
+struct bio *__blk_queue_bounce(struct bio *bio_orig, struct request_queue *q)
 {
 	struct bio *bio;
-	int rw = bio_data_dir(*bio_orig);
+	int rw = bio_data_dir(bio_orig);
 	struct bio_vec *to, from;
 	struct bvec_iter iter;
 	unsigned i = 0, bytes = 0;
 	bool bounce = false;
 	int sectors;
 
-	bio_for_each_segment(from, *bio_orig, iter) {
+	bio_for_each_segment(from, bio_orig, iter) {
 		if (i++ < BIO_MAX_VECS)
 			bytes += from.bv_len;
 		if (PageHighMem(from.bv_page))
 			bounce = true;
 	}
 	if (!bounce)
-		return;
+		return bio_orig;
 
 	/*
 	 * Individual bvecs might not be logical block aligned. Round down
@@ -225,13 +225,13 @@ void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
 	 */
 	sectors = ALIGN_DOWN(bytes, queue_logical_block_size(q)) >>
 			SECTOR_SHIFT;
-	if (sectors < bio_sectors(*bio_orig)) {
-		bio = bio_split(*bio_orig, sectors, GFP_NOIO, &bounce_bio_split);
-		bio_chain(bio, *bio_orig);
-		submit_bio_noacct(*bio_orig);
-		*bio_orig = bio;
+	if (sectors < bio_sectors(bio_orig)) {
+		bio = bio_split(bio_orig, sectors, GFP_NOIO, &bounce_bio_split);
+		bio_chain(bio, bio_orig);
+		submit_bio_noacct(bio_orig);
+		bio_orig = bio;
 	}
-	bio = bounce_clone_bio(*bio_orig);
+	bio = bounce_clone_bio(bio_orig);
 
 	/*
 	 * Bvec table can't be updated by bio_for_each_segment_all(),
@@ -254,7 +254,7 @@ void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
 		to->bv_page = bounce_page;
 	}
 
-	trace_block_bio_bounce(*bio_orig);
+	trace_block_bio_bounce(bio_orig);
 
 	bio->bi_flags |= (1 << BIO_BOUNCED);
 
@@ -263,6 +263,6 @@ void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
 	else
 		bio->bi_end_io = bounce_end_io_write;
 
-	bio->bi_private = *bio_orig;
-	*bio_orig = bio;
+	bio->bi_private = bio_orig;
+	return bio;
 }
diff --git a/block/genhd.c b/block/genhd.c
index e1d5b10ac193..b901fea1d55a 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1151,6 +1151,7 @@ static void disk_release(struct device *dev)
 		blk_mq_exit_queue(disk->queue);
 
 	blkcg_exit_queue(disk->queue);
+	bioset_exit(&disk->bio_split);
 
 	disk_release_events(disk);
 	kfree(disk->random);
@@ -1342,9 +1343,12 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
 	if (!disk)
 		goto out_put_queue;
 
+	if (bioset_init(&disk->bio_split, BIO_POOL_SIZE, 0, 0))
+		goto out_free_disk;
+
 	disk->bdi = bdi_alloc(node_id);
 	if (!disk->bdi)
-		goto out_free_disk;
+		goto out_free_bioset;
 
 	/* bdev_alloc() might need the queue, set before the first call */
 	disk->queue = q;
@@ -1382,6 +1386,8 @@ out_destroy_part_tbl:
 	iput(disk->part0->bd_inode);
 out_free_bdi:
 	bdi_put(disk->bdi);
+out_free_bioset:
+	bioset_exit(&disk->bio_split);
 out_free_disk:
 	kfree(disk);
 out_put_queue:
diff --git a/crypto/kpp.c b/crypto/kpp.c
index 7aa6ba4b60a4..678e871ce418 100644
--- a/crypto/kpp.c
+++ b/crypto/kpp.c
@@ -104,6 +104,12 @@ int crypto_grab_kpp(struct crypto_kpp_spawn *spawn,
 }
 EXPORT_SYMBOL_GPL(crypto_grab_kpp);
 
+int crypto_has_kpp(const char *alg_name, u32 type, u32 mask)
+{
+	return crypto_type_has_alg(alg_name, &crypto_kpp_type, type, mask);
+}
+EXPORT_SYMBOL_GPL(crypto_has_kpp);
+
 static void kpp_prepare_alg(struct kpp_alg *alg)
 {
 	struct crypto_alg *base = &alg->base;
diff --git a/crypto/shash.c b/crypto/shash.c
index 0a0a50cb694f..4c88e63b3350 100644
--- a/crypto/shash.c
+++ b/crypto/shash.c
@@ -521,6 +521,12 @@ struct crypto_shash *crypto_alloc_shash(const char *alg_name, u32 type,
 }
 EXPORT_SYMBOL_GPL(crypto_alloc_shash);
 
+int crypto_has_shash(const char *alg_name, u32 type, u32 mask)
+{
+	return crypto_type_has_alg(alg_name, &crypto_shash_type, type, mask);
+}
+EXPORT_SYMBOL_GPL(crypto_has_shash);
+
 static int shash_prepare_alg(struct shash_alg *alg)
 {
 	struct crypto_alg *base = &alg->base;
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index e19fcab016ba..db1b4b202646 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -248,15 +248,6 @@ config BLK_DEV_NBD
 
 	  If unsure, say N.
 
-config BLK_DEV_SX8
-	tristate "Promise SATA SX8 support"
-	depends on PCI
-	help
-	  Saying Y or M here will enable support for the 
-	  Promise SATA SX8 controllers.
-
-	  Use devices /dev/sx8/$N and /dev/sx8/$Np$M.
-
 config BLK_DEV_RAM
 	tristate "RAM block device support"
 	help
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index be631352567e..101612cba303 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -26,8 +26,6 @@ obj-$(CONFIG_SUNVDC)		+= sunvdc.o
 obj-$(CONFIG_BLK_DEV_NBD)	+= nbd.o
 obj-$(CONFIG_VIRTIO_BLK)	+= virtio_blk.o
 
-obj-$(CONFIG_BLK_DEV_SX8)	+= sx8.o
-
 obj-$(CONFIG_XEN_BLKDEV_FRONTEND)	+= xen-blkfront.o
 obj-$(CONFIG_XEN_BLKDEV_BACKEND)	+= xen-blkback/
 obj-$(CONFIG_BLK_DEV_DRBD)     += drbd/
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index 603f6828dd79..7d9db33363de 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -974,25 +974,58 @@ static void drbd_bm_endio(struct bio *bio)
 	}
 }
 
+/* For the layout, see comment above drbd_md_set_sector_offsets(). */
+static inline sector_t drbd_md_last_bitmap_sector(struct drbd_backing_dev *bdev)
+{
+	switch (bdev->md.meta_dev_idx) {
+	case DRBD_MD_INDEX_INTERNAL:
+	case DRBD_MD_INDEX_FLEX_INT:
+		return bdev->md.md_offset + bdev->md.al_offset -1;
+	case DRBD_MD_INDEX_FLEX_EXT:
+	default:
+		return bdev->md.md_offset + bdev->md.md_size_sect -1;
+	}
+}
+
 static void bm_page_io_async(struct drbd_bm_aio_ctx *ctx, int page_nr) __must_hold(local)
 {
 	struct drbd_device *device = ctx->device;
 	enum req_op op = ctx->flags & BM_AIO_READ ? REQ_OP_READ : REQ_OP_WRITE;
-	struct bio *bio = bio_alloc_bioset(device->ldev->md_bdev, 1, op,
-					   GFP_NOIO, &drbd_md_io_bio_set);
 	struct drbd_bitmap *b = device->bitmap;
+	struct bio *bio;
 	struct page *page;
+	sector_t last_bm_sect;
+	sector_t first_bm_sect;
+	sector_t on_disk_sector;
 	unsigned int len;
 
-	sector_t on_disk_sector =
-		device->ldev->md.md_offset + device->ldev->md.bm_offset;
-	on_disk_sector += ((sector_t)page_nr) << (PAGE_SHIFT-9);
+	first_bm_sect = device->ldev->md.md_offset + device->ldev->md.bm_offset;
+	on_disk_sector = first_bm_sect + (((sector_t)page_nr) << (PAGE_SHIFT-SECTOR_SHIFT));
 
 	/* this might happen with very small
 	 * flexible external meta data device,
 	 * or with PAGE_SIZE > 4k */
-	len = min_t(unsigned int, PAGE_SIZE,
-		(drbd_md_last_sector(device->ldev) - on_disk_sector + 1)<<9);
+	last_bm_sect = drbd_md_last_bitmap_sector(device->ldev);
+	if (first_bm_sect <= on_disk_sector && last_bm_sect >= on_disk_sector) {
+		sector_t len_sect = last_bm_sect - on_disk_sector + 1;
+		if (len_sect < PAGE_SIZE/SECTOR_SIZE)
+			len = (unsigned int)len_sect*SECTOR_SIZE;
+		else
+			len = PAGE_SIZE;
+	} else {
+		if (__ratelimit(&drbd_ratelimit_state)) {
+			drbd_err(device, "Invalid offset during on-disk bitmap access: "
+				 "page idx %u, sector %llu\n", page_nr, on_disk_sector);
+		}
+		ctx->error = -EIO;
+		bm_set_page_io_err(b->bm_pages[page_nr]);
+		if (atomic_dec_and_test(&ctx->in_flight)) {
+			ctx->done = 1;
+			wake_up(&device->misc_wait);
+			kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy);
+		}
+		return;
+	}
 
 	/* serialize IO on this page */
 	bm_page_lock_io(device, page_nr);
@@ -1007,6 +1040,8 @@ static void bm_page_io_async(struct drbd_bm_aio_ctx *ctx, int page_nr) __must_ho
 		bm_store_page_idx(page, page_nr);
 	} else
 		page = b->bm_pages[page_nr];
+	bio = bio_alloc_bioset(device->ldev->md_bdev, 1, op, GFP_NOIO,
+			&drbd_md_io_bio_set);
 	bio->bi_iter.bi_sector = on_disk_sector;
 	/* bio_add_page of a single page to an empty bio will always succeed,
 	 * according to api.  Do we want to assert that? */
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 6d8dd14458c6..8f7f144e54f3 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -1608,7 +1608,7 @@ void drbd_submit_bio(struct bio *bio)
 {
 	struct drbd_device *device = bio->bi_bdev->bd_disk->private_data;
 
-	blk_queue_split(&bio);
+	bio = bio_split_to_limits(bio);
 
 	/*
 	 * what we "blindly" assume:
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index f5d098a148cb..2a709daefbc4 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -11,6 +11,8 @@
  * (part of code stolen from loop.c)
  */
 
+#define pr_fmt(fmt) "nbd: " fmt
+
 #include <linux/major.h>
 
 #include <linux/blkdev.h>
@@ -1950,7 +1952,7 @@ again:
 			     test_bit(NBD_DISCONNECT_REQUESTED, &nbd->flags)) ||
 			    !refcount_inc_not_zero(&nbd->refs)) {
 				mutex_unlock(&nbd_index_mutex);
-				pr_err("nbd: device at index %d is going down\n",
+				pr_err("device at index %d is going down\n",
 					index);
 				return -EINVAL;
 			}
@@ -1961,7 +1963,7 @@ again:
 	if (!nbd) {
 		nbd = nbd_dev_add(index, 2);
 		if (IS_ERR(nbd)) {
-			pr_err("nbd: failed to add new device\n");
+			pr_err("failed to add new device\n");
 			return PTR_ERR(nbd);
 		}
 	}
diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c
index 8b224ede2e33..c451c477978f 100644
--- a/drivers/block/null_blk/main.c
+++ b/drivers/block/null_blk/main.c
@@ -201,6 +201,22 @@ static bool g_use_per_node_hctx;
 module_param_named(use_per_node_hctx, g_use_per_node_hctx, bool, 0444);
 MODULE_PARM_DESC(use_per_node_hctx, "Use per-node allocation for hardware context queues. Default: false");
 
+static bool g_memory_backed;
+module_param_named(memory_backed, g_memory_backed, bool, 0444);
+MODULE_PARM_DESC(memory_backed, "Create a memory-backed block device. Default: false");
+
+static bool g_discard;
+module_param_named(discard, g_discard, bool, 0444);
+MODULE_PARM_DESC(discard, "Support discard operations (requires memory-backed null_blk device). Default: false");
+
+static unsigned long g_cache_size;
+module_param_named(cache_size, g_cache_size, ulong, 0444);
+MODULE_PARM_DESC(mbps, "Cache size in MiB for memory-backed device. Default: 0 (none)");
+
+static unsigned int g_mbps;
+module_param_named(mbps, g_mbps, uint, 0444);
+MODULE_PARM_DESC(mbps, "Limit maximum bandwidth (in MiB/s). Default: 0 (no limit)");
+
 static bool g_zoned;
 module_param_named(zoned, g_zoned, bool, S_IRUGO);
 MODULE_PARM_DESC(zoned, "Make device as a host-managed zoned block device. Default: false");
@@ -409,6 +425,8 @@ NULLB_DEVICE_ATTR(zone_nr_conv, uint, NULL);
 NULLB_DEVICE_ATTR(zone_max_open, uint, NULL);
 NULLB_DEVICE_ATTR(zone_max_active, uint, NULL);
 NULLB_DEVICE_ATTR(virt_boundary, bool, NULL);
+NULLB_DEVICE_ATTR(no_sched, bool, NULL);
+NULLB_DEVICE_ATTR(shared_tag_bitmap, bool, NULL);
 
 static ssize_t nullb_device_power_show(struct config_item *item, char *page)
 {
@@ -532,6 +550,8 @@ static struct configfs_attribute *nullb_device_attrs[] = {
 	&nullb_device_attr_zone_max_open,
 	&nullb_device_attr_zone_max_active,
 	&nullb_device_attr_virt_boundary,
+	&nullb_device_attr_no_sched,
+	&nullb_device_attr_shared_tag_bitmap,
 	NULL,
 };
 
@@ -588,7 +608,13 @@ nullb_group_drop_item(struct config_group *group, struct config_item *item)
 static ssize_t memb_group_features_show(struct config_item *item, char *page)
 {
 	return snprintf(page, PAGE_SIZE,
-			"memory_backed,discard,bandwidth,cache,badblocks,zoned,zone_size,zone_capacity,zone_nr_conv,zone_max_open,zone_max_active,blocksize,max_sectors,virt_boundary\n");
+			"badblocks,blocking,blocksize,cache_size,"
+			"completion_nsec,discard,home_node,hw_queue_depth,"
+			"irqmode,max_sectors,mbps,memory_backed,no_sched,"
+			"poll_queues,power,queue_mode,shared_tag_bitmap,size,"
+			"submit_queues,use_per_node_hctx,virt_boundary,zoned,"
+			"zone_capacity,zone_max_active,zone_max_open,"
+			"zone_nr_conv,zone_size\n");
 }
 
 CONFIGFS_ATTR_RO(memb_group_, features);
@@ -650,6 +676,10 @@ static struct nullb_device *null_alloc_dev(void)
 	dev->irqmode = g_irqmode;
 	dev->hw_queue_depth = g_hw_queue_depth;
 	dev->blocking = g_blocking;
+	dev->memory_backed = g_memory_backed;
+	dev->discard = g_discard;
+	dev->cache_size = g_cache_size;
+	dev->mbps = g_mbps;
 	dev->use_per_node_hctx = g_use_per_node_hctx;
 	dev->zoned = g_zoned;
 	dev->zone_size = g_zone_size;
@@ -658,6 +688,8 @@ static struct nullb_device *null_alloc_dev(void)
 	dev->zone_max_open = g_zone_max_open;
 	dev->zone_max_active = g_zone_max_active;
 	dev->virt_boundary = g_virt_boundary;
+	dev->no_sched = g_no_sched;
+	dev->shared_tag_bitmap = g_shared_tag_bitmap;
 	return dev;
 }
 
@@ -1655,7 +1687,7 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx,
 
 static void cleanup_queue(struct nullb_queue *nq)
 {
-	kfree(nq->tag_map);
+	bitmap_free(nq->tag_map);
 	kfree(nq->cmds);
 }
 
@@ -1782,14 +1814,13 @@ static const struct block_device_operations null_rq_ops = {
 static int setup_commands(struct nullb_queue *nq)
 {
 	struct nullb_cmd *cmd;
-	int i, tag_size;
+	int i;
 
 	nq->cmds = kcalloc(nq->queue_depth, sizeof(*cmd), GFP_KERNEL);
 	if (!nq->cmds)
 		return -ENOMEM;
 
-	tag_size = ALIGN(nq->queue_depth, BITS_PER_LONG) / BITS_PER_LONG;
-	nq->tag_map = kcalloc(tag_size, sizeof(unsigned long), GFP_KERNEL);
+	nq->tag_map = bitmap_zalloc(nq->queue_depth, GFP_KERNEL);
 	if (!nq->tag_map) {
 		kfree(nq->cmds);
 		return -ENOMEM;
@@ -1866,31 +1897,48 @@ static int null_gendisk_register(struct nullb *nullb)
 
 static int null_init_tag_set(struct nullb *nullb, struct blk_mq_tag_set *set)
 {
+	unsigned int flags = BLK_MQ_F_SHOULD_MERGE;
+	int hw_queues, numa_node;
+	unsigned int queue_depth;
 	int poll_queues;
 
-	set->ops = &null_mq_ops;
-	set->nr_hw_queues = nullb ? nullb->dev->submit_queues :
-						g_submit_queues;
-	poll_queues = nullb ? nullb->dev->poll_queues : g_poll_queues;
-	if (poll_queues)
-		set->nr_hw_queues += poll_queues;
-	set->queue_depth = nullb ? nullb->dev->hw_queue_depth :
-						g_hw_queue_depth;
-	set->numa_node = nullb ? nullb->dev->home_node : g_home_node;
-	set->cmd_size	= sizeof(struct nullb_cmd);
-	set->flags = BLK_MQ_F_SHOULD_MERGE;
-	if (g_no_sched)
-		set->flags |= BLK_MQ_F_NO_SCHED;
-	if (g_shared_tag_bitmap)
-		set->flags |= BLK_MQ_F_TAG_HCTX_SHARED;
-	set->driver_data = nullb;
-	if (poll_queues)
-		set->nr_maps = 3;
-	else
-		set->nr_maps = 1;
+	if (nullb) {
+		hw_queues = nullb->dev->submit_queues;
+		poll_queues = nullb->dev->poll_queues;
+		queue_depth = nullb->dev->hw_queue_depth;
+		numa_node = nullb->dev->home_node;
+		if (nullb->dev->no_sched)
+			flags |= BLK_MQ_F_NO_SCHED;
+		if (nullb->dev->shared_tag_bitmap)
+			flags |= BLK_MQ_F_TAG_HCTX_SHARED;
+		if (nullb->dev->blocking)
+			flags |= BLK_MQ_F_BLOCKING;
+	} else {
+		hw_queues = g_submit_queues;
+		poll_queues = g_poll_queues;
+		queue_depth = g_hw_queue_depth;
+		numa_node = g_home_node;
+		if (g_no_sched)
+			flags |= BLK_MQ_F_NO_SCHED;
+		if (g_shared_tag_bitmap)
+			flags |= BLK_MQ_F_TAG_HCTX_SHARED;
+		if (g_blocking)
+			flags |= BLK_MQ_F_BLOCKING;
+	}
 
-	if ((nullb && nullb->dev->blocking) || g_blocking)
-		set->flags |= BLK_MQ_F_BLOCKING;
+	set->ops = &null_mq_ops;
+	set->cmd_size	= sizeof(struct nullb_cmd);
+	set->flags = flags;
+	set->driver_data = nullb;
+	set->nr_hw_queues = hw_queues;
+	set->queue_depth = queue_depth;
+	set->numa_node = numa_node;
+	if (poll_queues) {
+		set->nr_hw_queues += poll_queues;
+		set->nr_maps = 3;
+	} else {
+		set->nr_maps = 1;
+	}
 
 	return blk_mq_alloc_tag_set(set);
 }
@@ -2042,8 +2090,13 @@ static int null_add_dev(struct nullb_device *dev)
 	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, nullb->q);
 
 	mutex_lock(&lock);
-	nullb->index = ida_simple_get(&nullb_indexes, 0, 0, GFP_KERNEL);
-	dev->index = nullb->index;
+	rv = ida_simple_get(&nullb_indexes, 0, 0, GFP_KERNEL);
+	if (rv < 0) {
+		mutex_unlock(&lock);
+		goto out_cleanup_zone;
+	}
+	nullb->index = rv;
+	dev->index = rv;
 	mutex_unlock(&lock);
 
 	blk_queue_logical_block_size(nullb->q, dev->blocksize);
@@ -2069,7 +2122,7 @@ static int null_add_dev(struct nullb_device *dev)
 
 	rv = null_gendisk_register(nullb);
 	if (rv)
-		goto out_cleanup_zone;
+		goto out_ida_free;
 
 	mutex_lock(&lock);
 	list_add_tail(&nullb->list, &nullb_list);
@@ -2078,6 +2131,9 @@ static int null_add_dev(struct nullb_device *dev)
 	pr_info("disk %s created\n", nullb->disk_name);
 
 	return 0;
+
+out_ida_free:
+	ida_free(&nullb_indexes, nullb->index);
 out_cleanup_zone:
 	null_free_zoned_dev(dev);
 out_cleanup_disk:
diff --git a/drivers/block/null_blk/null_blk.h b/drivers/block/null_blk/null_blk.h
index 6fbf0a1b2622..94ff68052b1e 100644
--- a/drivers/block/null_blk/null_blk.h
+++ b/drivers/block/null_blk/null_blk.h
@@ -113,6 +113,8 @@ struct nullb_device {
 	bool discard; /* if support discard */
 	bool zoned; /* if device is zoned */
 	bool virt_boundary; /* virtual boundary on/off for the device */
+	bool no_sched; /* no IO scheduler for the device */
+	bool shared_tag_bitmap; /* use hostwide shared tags */
 };
 
 struct nullb {
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 01a15dbd9cde..4cea3b08087e 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -2399,7 +2399,7 @@ static void pkt_submit_bio(struct bio *bio)
 	struct pktcdvd_device *pd = bio->bi_bdev->bd_disk->queue->queuedata;
 	struct bio *split;
 
-	blk_queue_split(&bio);
+	bio = bio_split_to_limits(bio);
 
 	pkt_dbg(2, pd, "start = %6llx stop = %6llx\n",
 		(unsigned long long)bio->bi_iter.bi_sector,
diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c
index d1e0fefec90b..e1d080f680ed 100644
--- a/drivers/block/ps3vram.c
+++ b/drivers/block/ps3vram.c
@@ -586,7 +586,7 @@ static void ps3vram_submit_bio(struct bio *bio)
 
 	dev_dbg(&dev->core, "%s\n", __func__);
 
-	blk_queue_split(&bio);
+	bio = bio_split_to_limits(bio);
 
 	spin_lock_irq(&priv->lock);
 	busy = !bio_list_empty(&priv->list);
diff --git a/drivers/block/rnbd/rnbd-clt-sysfs.c b/drivers/block/rnbd/rnbd-clt-sysfs.c
index 2be5d87a3ca6..e7c7d9a68168 100644
--- a/drivers/block/rnbd/rnbd-clt-sysfs.c
+++ b/drivers/block/rnbd/rnbd-clt-sysfs.c
@@ -376,7 +376,7 @@ static ssize_t rnbd_clt_resize_dev_store(struct kobject *kobj,
 	if (ret)
 		return ret;
 
-	ret = rnbd_clt_resize_disk(dev, (size_t)sectors);
+	ret = rnbd_clt_resize_disk(dev, sectors);
 	if (ret)
 		return ret;
 
diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c
index b8d9e2824d9c..04da33a22ef4 100644
--- a/drivers/block/rnbd/rnbd-clt.c
+++ b/drivers/block/rnbd/rnbd-clt.c
@@ -68,39 +68,18 @@ static inline bool rnbd_clt_get_dev(struct rnbd_clt_dev *dev)
 	return refcount_inc_not_zero(&dev->refcount);
 }
 
-static int rnbd_clt_set_dev_attr(struct rnbd_clt_dev *dev,
-				 const struct rnbd_msg_open_rsp *rsp)
+static void rnbd_clt_change_capacity(struct rnbd_clt_dev *dev,
+				    sector_t new_nsectors)
 {
-	struct rnbd_clt_session *sess = dev->sess;
+	if (get_capacity(dev->gd) == new_nsectors)
+		return;
 
-	if (!rsp->logical_block_size)
-		return -EINVAL;
-
-	dev->device_id		    = le32_to_cpu(rsp->device_id);
-	dev->nsectors		    = le64_to_cpu(rsp->nsectors);
-	dev->logical_block_size	    = le16_to_cpu(rsp->logical_block_size);
-	dev->physical_block_size    = le16_to_cpu(rsp->physical_block_size);
-	dev->max_discard_sectors    = le32_to_cpu(rsp->max_discard_sectors);
-	dev->discard_granularity    = le32_to_cpu(rsp->discard_granularity);
-	dev->discard_alignment	    = le32_to_cpu(rsp->discard_alignment);
-	dev->secure_discard	    = le16_to_cpu(rsp->secure_discard);
-	dev->wc			    = !!(rsp->cache_policy & RNBD_WRITEBACK);
-	dev->fua		    = !!(rsp->cache_policy & RNBD_FUA);
-
-	dev->max_hw_sectors = sess->max_io_size / SECTOR_SIZE;
-	dev->max_segments = sess->max_segments;
-
-	return 0;
-}
-
-static int rnbd_clt_change_capacity(struct rnbd_clt_dev *dev,
-				    size_t new_nsectors)
-{
-	rnbd_clt_info(dev, "Device size changed from %zu to %zu sectors\n",
-		       dev->nsectors, new_nsectors);
-	dev->nsectors = new_nsectors;
-	set_capacity_and_notify(dev->gd, dev->nsectors);
-	return 0;
+	/*
+	 * If the size changed, we need to revalidate it
+	 */
+	rnbd_clt_info(dev, "Device size changed from %llu to %llu sectors\n",
+		      get_capacity(dev->gd), new_nsectors);
+	set_capacity_and_notify(dev->gd, new_nsectors);
 }
 
 static int process_msg_open_rsp(struct rnbd_clt_dev *dev,
@@ -119,19 +98,16 @@ static int process_msg_open_rsp(struct rnbd_clt_dev *dev,
 	if (dev->dev_state == DEV_STATE_MAPPED_DISCONNECTED) {
 		u64 nsectors = le64_to_cpu(rsp->nsectors);
 
-		/*
-		 * If the device was remapped and the size changed in the
-		 * meantime we need to revalidate it
-		 */
-		if (dev->nsectors != nsectors)
-			rnbd_clt_change_capacity(dev, nsectors);
+		rnbd_clt_change_capacity(dev, nsectors);
 		gd_kobj = &disk_to_dev(dev->gd)->kobj;
 		kobject_uevent(gd_kobj, KOBJ_ONLINE);
 		rnbd_clt_info(dev, "Device online, device remapped successfully\n");
 	}
-	err = rnbd_clt_set_dev_attr(dev, rsp);
-	if (err)
+	if (!rsp->logical_block_size) {
+		err = -EINVAL;
 		goto out;
+	}
+	dev->device_id = le32_to_cpu(rsp->device_id);
 	dev->dev_state = DEV_STATE_MAPPED;
 
 out:
@@ -140,7 +116,7 @@ out:
 	return err;
 }
 
-int rnbd_clt_resize_disk(struct rnbd_clt_dev *dev, size_t newsize)
+int rnbd_clt_resize_disk(struct rnbd_clt_dev *dev, sector_t newsize)
 {
 	int ret = 0;
 
@@ -150,7 +126,7 @@ int rnbd_clt_resize_disk(struct rnbd_clt_dev *dev, size_t newsize)
 		ret = -ENOENT;
 		goto out;
 	}
-	ret = rnbd_clt_change_capacity(dev, newsize);
+	rnbd_clt_change_capacity(dev, newsize);
 
 out:
 	mutex_unlock(&dev->lock);
@@ -507,6 +483,11 @@ static void msg_open_conf(struct work_struct *work)
 	struct rnbd_msg_open_rsp *rsp = iu->buf;
 	struct rnbd_clt_dev *dev = iu->dev;
 	int errno = iu->errno;
+	bool from_map = false;
+
+	/* INIT state is only triggered from rnbd_clt_map_device */
+	if (dev->dev_state == DEV_STATE_INIT)
+		from_map = true;
 
 	if (errno) {
 		rnbd_clt_err(dev,
@@ -523,7 +504,9 @@ static void msg_open_conf(struct work_struct *work)
 			send_msg_close(dev, device_id, RTRS_PERMIT_NOWAIT);
 		}
 	}
-	kfree(rsp);
+	/* We free rsp in rnbd_clt_map_device for map scenario */
+	if (!from_map)
+		kfree(rsp);
 	wake_up_iu_comp(iu, errno);
 	rnbd_put_iu(dev->sess, iu);
 	rnbd_clt_put_dev(dev);
@@ -942,7 +925,7 @@ static int rnbd_client_open(struct block_device *block_device, fmode_t mode)
 {
 	struct rnbd_clt_dev *dev = block_device->bd_disk->private_data;
 
-	if (dev->read_only && (mode & FMODE_WRITE))
+	if (get_disk_ro(dev->gd) && (mode & FMODE_WRITE))
 		return -EPERM;
 
 	if (dev->dev_state == DEV_STATE_UNMAPPED ||
@@ -963,10 +946,10 @@ static int rnbd_client_getgeo(struct block_device *block_device,
 			      struct hd_geometry *geo)
 {
 	u64 size;
-	struct rnbd_clt_dev *dev;
+	struct rnbd_clt_dev *dev = block_device->bd_disk->private_data;
+	struct queue_limits *limit = &dev->queue->limits;
 
-	dev = block_device->bd_disk->private_data;
-	size = dev->size * (dev->logical_block_size / SECTOR_SIZE);
+	size = dev->size * (limit->logical_block_size / SECTOR_SIZE);
 	geo->cylinders	= size >> 6;	/* size/64 */
 	geo->heads	= 4;
 	geo->sectors	= 16;
@@ -1350,11 +1333,15 @@ static void rnbd_init_mq_hw_queues(struct rnbd_clt_dev *dev)
 	}
 }
 
-static void setup_request_queue(struct rnbd_clt_dev *dev)
+static void setup_request_queue(struct rnbd_clt_dev *dev,
+				struct rnbd_msg_open_rsp *rsp)
 {
-	blk_queue_logical_block_size(dev->queue, dev->logical_block_size);
-	blk_queue_physical_block_size(dev->queue, dev->physical_block_size);
-	blk_queue_max_hw_sectors(dev->queue, dev->max_hw_sectors);
+	blk_queue_logical_block_size(dev->queue,
+				     le16_to_cpu(rsp->logical_block_size));
+	blk_queue_physical_block_size(dev->queue,
+				      le16_to_cpu(rsp->physical_block_size));
+	blk_queue_max_hw_sectors(dev->queue,
+				 dev->sess->max_io_size / SECTOR_SIZE);
 
 	/*
 	 * we don't support discards to "discontiguous" segments
@@ -1362,21 +1349,27 @@ static void setup_request_queue(struct rnbd_clt_dev *dev)
 	 */
 	blk_queue_max_discard_segments(dev->queue, 1);
 
-	blk_queue_max_discard_sectors(dev->queue, dev->max_discard_sectors);
-	dev->queue->limits.discard_granularity	= dev->discard_granularity;
-	dev->queue->limits.discard_alignment	= dev->discard_alignment;
-	if (dev->secure_discard)
+	blk_queue_max_discard_sectors(dev->queue,
+				      le32_to_cpu(rsp->max_discard_sectors));
+	dev->queue->limits.discard_granularity =
+					le32_to_cpu(rsp->discard_granularity);
+	dev->queue->limits.discard_alignment =
+					le32_to_cpu(rsp->discard_alignment);
+	if (le16_to_cpu(rsp->secure_discard))
 		blk_queue_max_secure_erase_sectors(dev->queue,
-				dev->max_discard_sectors);
+					le32_to_cpu(rsp->max_discard_sectors));
 	blk_queue_flag_set(QUEUE_FLAG_SAME_COMP, dev->queue);
 	blk_queue_flag_set(QUEUE_FLAG_SAME_FORCE, dev->queue);
-	blk_queue_max_segments(dev->queue, dev->max_segments);
+	blk_queue_max_segments(dev->queue, dev->sess->max_segments);
 	blk_queue_io_opt(dev->queue, dev->sess->max_io_size);
 	blk_queue_virt_boundary(dev->queue, SZ_4K - 1);
-	blk_queue_write_cache(dev->queue, dev->wc, dev->fua);
+	blk_queue_write_cache(dev->queue,
+			      !!(rsp->cache_policy & RNBD_WRITEBACK),
+			      !!(rsp->cache_policy & RNBD_FUA));
 }
 
-static int rnbd_clt_setup_gen_disk(struct rnbd_clt_dev *dev, int idx)
+static int rnbd_clt_setup_gen_disk(struct rnbd_clt_dev *dev,
+				   struct rnbd_msg_open_rsp *rsp, int idx)
 {
 	int err;
 
@@ -1388,19 +1381,15 @@ static int rnbd_clt_setup_gen_disk(struct rnbd_clt_dev *dev, int idx)
 	dev->gd->private_data	= dev;
 	snprintf(dev->gd->disk_name, sizeof(dev->gd->disk_name), "rnbd%d",
 		 idx);
-	pr_debug("disk_name=%s, capacity=%zu\n",
+	pr_debug("disk_name=%s, capacity=%llu\n",
 		 dev->gd->disk_name,
-		 dev->nsectors * (dev->logical_block_size / SECTOR_SIZE)
-		 );
+		 le64_to_cpu(rsp->nsectors) *
+		 (le16_to_cpu(rsp->logical_block_size) / SECTOR_SIZE));
 
-	set_capacity(dev->gd, dev->nsectors);
+	set_capacity(dev->gd, le64_to_cpu(rsp->nsectors));
 
-	if (dev->access_mode == RNBD_ACCESS_RO) {
-		dev->read_only = true;
+	if (dev->access_mode == RNBD_ACCESS_RO)
 		set_disk_ro(dev->gd, true);
-	} else {
-		dev->read_only = false;
-	}
 
 	/*
 	 * Network device does not need rotational
@@ -1413,11 +1402,13 @@ static int rnbd_clt_setup_gen_disk(struct rnbd_clt_dev *dev, int idx)
 	return err;
 }
 
-static int rnbd_client_setup_device(struct rnbd_clt_dev *dev)
+static int rnbd_client_setup_device(struct rnbd_clt_dev *dev,
+				    struct rnbd_msg_open_rsp *rsp)
 {
 	int idx = dev->clt_device_id;
 
-	dev->size = dev->nsectors * dev->logical_block_size;
+	dev->size = le64_to_cpu(rsp->nsectors) *
+			le16_to_cpu(rsp->logical_block_size);
 
 	dev->gd = blk_mq_alloc_disk(&dev->sess->tag_set, dev);
 	if (IS_ERR(dev->gd))
@@ -1425,8 +1416,8 @@ static int rnbd_client_setup_device(struct rnbd_clt_dev *dev)
 	dev->queue = dev->gd->queue;
 	rnbd_init_mq_hw_queues(dev);
 
-	setup_request_queue(dev);
-	return rnbd_clt_setup_gen_disk(dev, idx);
+	setup_request_queue(dev, rsp);
+	return rnbd_clt_setup_gen_disk(dev, rsp, idx);
 }
 
 static struct rnbd_clt_dev *init_dev(struct rnbd_clt_session *sess,
@@ -1562,7 +1553,14 @@ struct rnbd_clt_dev *rnbd_clt_map_device(const char *sessname,
 {
 	struct rnbd_clt_session *sess;
 	struct rnbd_clt_dev *dev;
-	int ret;
+	int ret, errno;
+	struct rnbd_msg_open_rsp *rsp;
+	struct rnbd_msg_open msg;
+	struct rnbd_iu *iu;
+	struct kvec vec = {
+		.iov_base = &msg,
+		.iov_len  = sizeof(msg)
+	};
 
 	if (exists_devpath(pathname, sessname))
 		return ERR_PTR(-EEXIST);
@@ -1582,17 +1580,47 @@ struct rnbd_clt_dev *rnbd_clt_map_device(const char *sessname,
 		ret = -EEXIST;
 		goto put_dev;
 	}
-	ret = send_msg_open(dev, RTRS_PERMIT_WAIT);
+
+	rsp = kzalloc(sizeof(*rsp), GFP_KERNEL);
+	if (!rsp) {
+		ret = -ENOMEM;
+		goto del_dev;
+	}
+
+	iu = rnbd_get_iu(sess, RTRS_ADMIN_CON, RTRS_PERMIT_WAIT);
+	if (!iu) {
+		ret = -ENOMEM;
+		kfree(rsp);
+		goto del_dev;
+	}
+	iu->buf = rsp;
+	iu->dev = dev;
+	sg_init_one(iu->sgt.sgl, rsp, sizeof(*rsp));
+
+	msg.hdr.type    = cpu_to_le16(RNBD_MSG_OPEN);
+	msg.access_mode = dev->access_mode;
+	strscpy(msg.dev_name, dev->pathname, sizeof(msg.dev_name));
+
+	WARN_ON(!rnbd_clt_get_dev(dev));
+	ret = send_usr_msg(sess->rtrs, READ, iu,
+			   &vec, sizeof(*rsp), iu->sgt.sgl, 1,
+			   msg_open_conf, &errno, RTRS_PERMIT_WAIT);
+	if (ret) {
+		rnbd_clt_put_dev(dev);
+		rnbd_put_iu(sess, iu);
+	} else {
+		ret = errno;
+	}
 	if (ret) {
 		rnbd_clt_err(dev,
 			      "map_device: failed, can't open remote device, err: %d\n",
 			      ret);
-		goto del_dev;
+		goto put_iu;
 	}
 	mutex_lock(&dev->lock);
 	pr_debug("Opened remote device: session=%s, path='%s'\n",
 		 sess->sessname, pathname);
-	ret = rnbd_client_setup_device(dev);
+	ret = rnbd_client_setup_device(dev, rsp);
 	if (ret) {
 		rnbd_clt_err(dev,
 			      "map_device: Failed to configure device, err: %d\n",
@@ -1602,21 +1630,30 @@ struct rnbd_clt_dev *rnbd_clt_map_device(const char *sessname,
 	}
 
 	rnbd_clt_info(dev,
-		       "map_device: Device mapped as %s (nsectors: %zu, logical_block_size: %d, physical_block_size: %d, max_discard_sectors: %d, discard_granularity: %d, discard_alignment: %d, secure_discard: %d, max_segments: %d, max_hw_sectors: %d, wc: %d, fua: %d)\n",
-		       dev->gd->disk_name, dev->nsectors,
-		       dev->logical_block_size, dev->physical_block_size,
-		       dev->max_discard_sectors,
-		       dev->discard_granularity, dev->discard_alignment,
-		       dev->secure_discard, dev->max_segments,
-		       dev->max_hw_sectors, dev->wc, dev->fua);
+		       "map_device: Device mapped as %s (nsectors: %llu, logical_block_size: %d, physical_block_size: %d, max_discard_sectors: %d, discard_granularity: %d, discard_alignment: %d, secure_discard: %d, max_segments: %d, max_hw_sectors: %d, wc: %d, fua: %d)\n",
+		       dev->gd->disk_name, le64_to_cpu(rsp->nsectors),
+		       le16_to_cpu(rsp->logical_block_size),
+		       le16_to_cpu(rsp->physical_block_size),
+		       le32_to_cpu(rsp->max_discard_sectors),
+		       le32_to_cpu(rsp->discard_granularity),
+		       le32_to_cpu(rsp->discard_alignment),
+		       le16_to_cpu(rsp->secure_discard),
+		       sess->max_segments, sess->max_io_size / SECTOR_SIZE,
+		       !!(rsp->cache_policy & RNBD_WRITEBACK),
+		       !!(rsp->cache_policy & RNBD_FUA));
 
 	mutex_unlock(&dev->lock);
+	kfree(rsp);
+	rnbd_put_iu(sess, iu);
 	rnbd_clt_put_sess(sess);
 
 	return dev;
 
 send_close:
 	send_msg_close(dev, dev->device_id, RTRS_PERMIT_WAIT);
+put_iu:
+	kfree(rsp);
+	rnbd_put_iu(sess, iu);
 del_dev:
 	delete_dev(dev);
 put_dev:
diff --git a/drivers/block/rnbd/rnbd-clt.h b/drivers/block/rnbd/rnbd-clt.h
index 2e2e8c4a85c1..a48e040abe63 100644
--- a/drivers/block/rnbd/rnbd-clt.h
+++ b/drivers/block/rnbd/rnbd-clt.h
@@ -106,6 +106,7 @@ struct rnbd_queue {
 };
 
 struct rnbd_clt_dev {
+	struct kobject		kobj;
 	struct rnbd_clt_session	*sess;
 	struct request_queue	*queue;
 	struct rnbd_queue	*hw_queues;
@@ -114,27 +115,14 @@ struct rnbd_clt_dev {
 	u32			clt_device_id;
 	struct mutex		lock;
 	enum rnbd_clt_dev_state	dev_state;
+	refcount_t		refcount;
 	char			*pathname;
 	enum rnbd_access_mode	access_mode;
 	u32			nr_poll_queues;
-	bool			read_only;
-	bool			wc;
-	bool			fua;
-	u32			max_hw_sectors;
-	u32			max_discard_sectors;
-	u32			discard_granularity;
-	u32			discard_alignment;
-	u16			secure_discard;
-	u16			physical_block_size;
-	u16			logical_block_size;
-	u16			max_segments;
-	size_t			nsectors;
 	u64			size;		/* device size in bytes */
 	struct list_head        list;
 	struct gendisk		*gd;
-	struct kobject		kobj;
 	char			*blk_symlink_name;
-	refcount_t		refcount;
 	struct work_struct	unmap_on_rmmod_work;
 };
 
@@ -150,7 +138,7 @@ int rnbd_clt_unmap_device(struct rnbd_clt_dev *dev, bool force,
 			   const struct attribute *sysfs_self);
 
 int rnbd_clt_remap_device(struct rnbd_clt_dev *dev);
-int rnbd_clt_resize_disk(struct rnbd_clt_dev *dev, size_t newsize);
+int rnbd_clt_resize_disk(struct rnbd_clt_dev *dev, sector_t newsize);
 
 /* rnbd-clt-sysfs.c */
 
diff --git a/drivers/block/rnbd/rnbd-srv.c b/drivers/block/rnbd/rnbd-srv.c
index 0713014bf423..5e08da277ddf 100644
--- a/drivers/block/rnbd/rnbd-srv.c
+++ b/drivers/block/rnbd/rnbd-srv.c
@@ -224,7 +224,6 @@ void rnbd_destroy_sess_dev(struct rnbd_srv_sess_dev *sess_dev, bool keep_id)
 	wait_for_completion(&dc); /* wait for inflights to drop to zero */
 
 	rnbd_dev_close(sess_dev->rnbd_dev);
-	list_del(&sess_dev->sess_list);
 	mutex_lock(&sess_dev->dev->lock);
 	list_del(&sess_dev->dev_list);
 	if (sess_dev->open_flags & FMODE_WRITE)
@@ -239,14 +238,14 @@ void rnbd_destroy_sess_dev(struct rnbd_srv_sess_dev *sess_dev, bool keep_id)
 
 static void destroy_sess(struct rnbd_srv_session *srv_sess)
 {
-	struct rnbd_srv_sess_dev *sess_dev, *tmp;
+	struct rnbd_srv_sess_dev *sess_dev;
+	unsigned long index;
 
-	if (list_empty(&srv_sess->sess_dev_list))
+	if (xa_empty(&srv_sess->index_idr))
 		goto out;
 
 	mutex_lock(&srv_sess->lock);
-	list_for_each_entry_safe(sess_dev, tmp, &srv_sess->sess_dev_list,
-				 sess_list)
+	xa_for_each(&srv_sess->index_idr, index, sess_dev)
 		rnbd_srv_destroy_dev_session_sysfs(sess_dev);
 	mutex_unlock(&srv_sess->lock);
 
@@ -281,7 +280,6 @@ static int create_sess(struct rtrs_srv_sess *rtrs)
 
 	srv_sess->queue_depth = rtrs_srv_get_queue_depth(rtrs);
 	xa_init_flags(&srv_sess->index_idr, XA_FLAGS_ALLOC);
-	INIT_LIST_HEAD(&srv_sess->sess_dev_list);
 	mutex_init(&srv_sess->lock);
 	mutex_lock(&sess_lock);
 	list_add(&srv_sess->list, &sess_list);
@@ -323,10 +321,11 @@ void rnbd_srv_sess_dev_force_close(struct rnbd_srv_sess_dev *sess_dev,
 {
 	struct rnbd_srv_session	*sess = sess_dev->sess;
 
-	sess_dev->keep_id = true;
 	/* It is already started to close by client's close message. */
 	if (!mutex_trylock(&sess->lock))
 		return;
+
+	sess_dev->keep_id = true;
 	/* first remove sysfs itself to avoid deadlock */
 	sysfs_remove_file_self(&sess_dev->kobj, &attr->attr);
 	rnbd_srv_destroy_dev_session_sysfs(sess_dev);
@@ -666,11 +665,12 @@ static struct rnbd_srv_sess_dev *
 find_srv_sess_dev(struct rnbd_srv_session *srv_sess, const char *dev_name)
 {
 	struct rnbd_srv_sess_dev *sess_dev;
+	unsigned long index;
 
-	if (list_empty(&srv_sess->sess_dev_list))
+	if (xa_empty(&srv_sess->index_idr))
 		return NULL;
 
-	list_for_each_entry(sess_dev, &srv_sess->sess_dev_list, sess_list)
+	xa_for_each(&srv_sess->index_idr, index, sess_dev)
 		if (!strcmp(sess_dev->pathname, dev_name))
 			return sess_dev;
 
@@ -780,8 +780,6 @@ static int process_msg_open(struct rnbd_srv_session *srv_sess,
 	list_add(&srv_sess_dev->dev_list, &srv_dev->sess_dev_list);
 	mutex_unlock(&srv_dev->lock);
 
-	list_add(&srv_sess_dev->sess_list, &srv_sess->sess_dev_list);
-
 	rnbd_srv_info(srv_sess_dev, "Opened device '%s'\n", srv_dev->id);
 
 	kfree(full_path);
diff --git a/drivers/block/rnbd/rnbd-srv.h b/drivers/block/rnbd/rnbd-srv.h
index 6926f9069dc4..081bceaf4ae9 100644
--- a/drivers/block/rnbd/rnbd-srv.h
+++ b/drivers/block/rnbd/rnbd-srv.h
@@ -25,8 +25,6 @@ struct rnbd_srv_session {
 	int			queue_depth;
 
 	struct xarray		index_idr;
-	/* List of struct rnbd_srv_sess_dev */
-	struct list_head        sess_dev_list;
 	struct mutex		lock;
 	u8			ver;
 };
@@ -48,8 +46,6 @@ struct rnbd_srv_dev {
 struct rnbd_srv_sess_dev {
 	/* Entry inside rnbd_srv_dev struct */
 	struct list_head		dev_list;
-	/* Entry inside rnbd_srv_session struct */
-	struct list_head		sess_list;
 	struct rnbd_dev			*rnbd_dev;
 	struct rnbd_srv_session		*sess;
 	struct rnbd_srv_dev		*dev;
diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c
deleted file mode 100644
index 0e1a484cab0b..000000000000
--- a/drivers/block/sx8.c
+++ /dev/null
@@ -1,1582 +0,0 @@
-/*
- *  sx8.c: Driver for Promise SATA SX8 looks-like-I2O hardware
- *
- *  Copyright 2004-2005 Red Hat, Inc.
- *
- *  Author/maintainer:  Jeff Garzik <jgarzik@pobox.com>
- *
- *  This file is subject to the terms and conditions of the GNU General Public
- *  License.  See the file "COPYING" in the main directory of this archive
- *  for more details.
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/pci.h>
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/blk-mq.h>
-#include <linux/sched.h>
-#include <linux/interrupt.h>
-#include <linux/compiler.h>
-#include <linux/workqueue.h>
-#include <linux/bitops.h>
-#include <linux/delay.h>
-#include <linux/ktime.h>
-#include <linux/hdreg.h>
-#include <linux/dma-mapping.h>
-#include <linux/completion.h>
-#include <linux/scatterlist.h>
-#include <asm/io.h>
-#include <linux/uaccess.h>
-
-#if 0
-#define CARM_DEBUG
-#define CARM_VERBOSE_DEBUG
-#else
-#undef CARM_DEBUG
-#undef CARM_VERBOSE_DEBUG
-#endif
-#undef CARM_NDEBUG
-
-#define DRV_NAME "sx8"
-#define DRV_VERSION "1.0"
-#define PFX DRV_NAME ": "
-
-MODULE_AUTHOR("Jeff Garzik");
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("Promise SATA SX8 block driver");
-MODULE_VERSION(DRV_VERSION);
-
-/*
- * SX8 hardware has a single message queue for all ATA ports.
- * When this driver was written, the hardware (firmware?) would
- * corrupt data eventually, if more than one request was outstanding.
- * As one can imagine, having 8 ports bottlenecking on a single
- * command hurts performance.
- *
- * Based on user reports, later versions of the hardware (firmware?)
- * seem to be able to survive with more than one command queued.
- *
- * Therefore, we default to the safe option -- 1 command -- but
- * allow the user to increase this.
- *
- * SX8 should be able to support up to ~60 queued commands (CARM_MAX_REQ),
- * but problems seem to occur when you exceed ~30, even on newer hardware.
- */
-static int max_queue = 1;
-module_param(max_queue, int, 0444);
-MODULE_PARM_DESC(max_queue, "Maximum number of queued commands. (min==1, max==30, safe==1)");
-
-
-#define NEXT_RESP(idx)	((idx + 1) % RMSG_Q_LEN)
-
-/* 0xf is just arbitrary, non-zero noise; this is sorta like poisoning */
-#define TAG_ENCODE(tag)	(((tag) << 16) | 0xf)
-#define TAG_DECODE(tag)	(((tag) >> 16) & 0x1f)
-#define TAG_VALID(tag)	((((tag) & 0xf) == 0xf) && (TAG_DECODE(tag) < 32))
-
-/* note: prints function name for you */
-#ifdef CARM_DEBUG
-#define DPRINTK(fmt, args...) printk(KERN_ERR "%s: " fmt, __func__, ## args)
-#ifdef CARM_VERBOSE_DEBUG
-#define VPRINTK(fmt, args...) printk(KERN_ERR "%s: " fmt, __func__, ## args)
-#else
-#define VPRINTK(fmt, args...)
-#endif	/* CARM_VERBOSE_DEBUG */
-#else
-#define DPRINTK(fmt, args...)
-#define VPRINTK(fmt, args...)
-#endif	/* CARM_DEBUG */
-
-#ifdef CARM_NDEBUG
-#define assert(expr)
-#else
-#define assert(expr) \
-        if(unlikely(!(expr))) {                                   \
-        printk(KERN_ERR "Assertion failed! %s,%s,%s,line=%d\n", \
-	#expr, __FILE__, __func__, __LINE__);          \
-        }
-#endif
-
-/* defines only for the constants which don't work well as enums */
-struct carm_host;
-
-enum {
-	/* adapter-wide limits */
-	CARM_MAX_PORTS		= 8,
-	CARM_SHM_SIZE		= (4096 << 7),
-	CARM_MINORS_PER_MAJOR	= 256 / CARM_MAX_PORTS,
-	CARM_MAX_WAIT_Q		= CARM_MAX_PORTS + 1,
-
-	/* command message queue limits */
-	CARM_MAX_REQ		= 64,	       /* max command msgs per host */
-	CARM_MSG_LOW_WATER	= (CARM_MAX_REQ / 4),	     /* refill mark */
-
-	/* S/G limits, host-wide and per-request */
-	CARM_MAX_REQ_SG		= 32,	     /* max s/g entries per request */
-	CARM_MAX_HOST_SG	= 600,		/* max s/g entries per host */
-	CARM_SG_LOW_WATER	= (CARM_MAX_HOST_SG / 4),   /* re-fill mark */
-
-	/* hardware registers */
-	CARM_IHQP		= 0x1c,
-	CARM_INT_STAT		= 0x10, /* interrupt status */
-	CARM_INT_MASK		= 0x14, /* interrupt mask */
-	CARM_HMUC		= 0x18, /* host message unit control */
-	RBUF_ADDR_LO		= 0x20, /* response msg DMA buf low 32 bits */
-	RBUF_ADDR_HI		= 0x24, /* response msg DMA buf high 32 bits */
-	RBUF_BYTE_SZ		= 0x28,
-	CARM_RESP_IDX		= 0x2c,
-	CARM_CMS0		= 0x30, /* command message size reg 0 */
-	CARM_LMUC		= 0x48,
-	CARM_HMPHA		= 0x6c,
-	CARM_INITC		= 0xb5,
-
-	/* bits in CARM_INT_{STAT,MASK} */
-	INT_RESERVED		= 0xfffffff0,
-	INT_WATCHDOG		= (1 << 3),	/* watchdog timer */
-	INT_Q_OVERFLOW		= (1 << 2),	/* cmd msg q overflow */
-	INT_Q_AVAILABLE		= (1 << 1),	/* cmd msg q has free space */
-	INT_RESPONSE		= (1 << 0),	/* response msg available */
-	INT_ACK_MASK		= INT_WATCHDOG | INT_Q_OVERFLOW,
-	INT_DEF_MASK		= INT_RESERVED | INT_Q_OVERFLOW |
-				  INT_RESPONSE,
-
-	/* command messages, and related register bits */
-	CARM_HAVE_RESP		= 0x01,
-	CARM_MSG_READ		= 1,
-	CARM_MSG_WRITE		= 2,
-	CARM_MSG_VERIFY		= 3,
-	CARM_MSG_GET_CAPACITY	= 4,
-	CARM_MSG_FLUSH		= 5,
-	CARM_MSG_IOCTL		= 6,
-	CARM_MSG_ARRAY		= 8,
-	CARM_MSG_MISC		= 9,
-	CARM_CME		= (1 << 2),
-	CARM_RME		= (1 << 1),
-	CARM_WZBC		= (1 << 0),
-	CARM_RMI		= (1 << 0),
-	CARM_Q_FULL		= (1 << 3),
-	CARM_MSG_SIZE		= 288,
-	CARM_Q_LEN		= 48,
-
-	/* CARM_MSG_IOCTL messages */
-	CARM_IOC_SCAN_CHAN	= 5,	/* scan channels for devices */
-	CARM_IOC_GET_TCQ	= 13,	/* get tcq/ncq depth */
-	CARM_IOC_SET_TCQ	= 14,	/* set tcq/ncq depth */
-
-	IOC_SCAN_CHAN_NODEV	= 0x1f,
-	IOC_SCAN_CHAN_OFFSET	= 0x40,
-
-	/* CARM_MSG_ARRAY messages */
-	CARM_ARRAY_INFO		= 0,
-
-	ARRAY_NO_EXIST		= (1 << 31),
-
-	/* response messages */
-	RMSG_SZ			= 8,	/* sizeof(struct carm_response) */
-	RMSG_Q_LEN		= 48,	/* resp. msg list length */
-	RMSG_OK			= 1,	/* bit indicating msg was successful */
-					/* length of entire resp. msg buffer */
-	RBUF_LEN		= RMSG_SZ * RMSG_Q_LEN,
-
-	PDC_SHM_SIZE		= (4096 << 7), /* length of entire h/w buffer */
-
-	/* CARM_MSG_MISC messages */
-	MISC_GET_FW_VER		= 2,
-	MISC_ALLOC_MEM		= 3,
-	MISC_SET_TIME		= 5,
-
-	/* MISC_GET_FW_VER feature bits */
-	FW_VER_4PORT		= (1 << 2), /* 1=4 ports, 0=8 ports */
-	FW_VER_NON_RAID		= (1 << 1), /* 1=non-RAID firmware, 0=RAID */
-	FW_VER_ZCR		= (1 << 0), /* zero channel RAID (whatever that is) */
-
-	/* carm_host flags */
-	FL_NON_RAID		= FW_VER_NON_RAID,
-	FL_4PORT		= FW_VER_4PORT,
-	FL_FW_VER_MASK		= (FW_VER_NON_RAID | FW_VER_4PORT),
-	FL_DYN_MAJOR		= (1 << 17),
-};
-
-enum {
-	CARM_SG_BOUNDARY	= 0xffffUL,	    /* s/g segment boundary */
-};
-
-enum scatter_gather_types {
-	SGT_32BIT		= 0,
-	SGT_64BIT		= 1,
-};
-
-enum host_states {
-	HST_INVALID,		/* invalid state; never used */
-	HST_ALLOC_BUF,		/* setting up master SHM area */
-	HST_ERROR,		/* we never leave here */
-	HST_PORT_SCAN,		/* start dev scan */
-	HST_DEV_SCAN_START,	/* start per-device probe */
-	HST_DEV_SCAN,		/* continue per-device probe */
-	HST_DEV_ACTIVATE,	/* activate devices we found */
-	HST_PROBE_FINISHED,	/* probe is complete */
-	HST_PROBE_START,	/* initiate probe */
-	HST_SYNC_TIME,		/* tell firmware what time it is */
-	HST_GET_FW_VER,		/* get firmware version, adapter port cnt */
-};
-
-#ifdef CARM_DEBUG
-static const char *state_name[] = {
-	"HST_INVALID",
-	"HST_ALLOC_BUF",
-	"HST_ERROR",
-	"HST_PORT_SCAN",
-	"HST_DEV_SCAN_START",
-	"HST_DEV_SCAN",
-	"HST_DEV_ACTIVATE",
-	"HST_PROBE_FINISHED",
-	"HST_PROBE_START",
-	"HST_SYNC_TIME",
-	"HST_GET_FW_VER",
-};
-#endif
-
-struct carm_port {
-	unsigned int			port_no;
-	struct gendisk			*disk;
-	struct carm_host		*host;
-
-	/* attached device characteristics */
-	u64				capacity;
-	char				name[41];
-	u16				dev_geom_head;
-	u16				dev_geom_sect;
-	u16				dev_geom_cyl;
-};
-
-struct carm_request {
-	int				n_elem;
-	unsigned int			msg_type;
-	unsigned int			msg_subtype;
-	unsigned int			msg_bucket;
-	struct scatterlist		sg[CARM_MAX_REQ_SG];
-};
-
-struct carm_host {
-	unsigned long			flags;
-	void				__iomem *mmio;
-	void				*shm;
-	dma_addr_t			shm_dma;
-
-	int				major;
-	int				id;
-	char				name[32];
-
-	spinlock_t			lock;
-	struct pci_dev			*pdev;
-	unsigned int			state;
-	u32				fw_ver;
-
-	struct blk_mq_tag_set		tag_set;
-	struct request_queue		*oob_q;
-	unsigned int			n_oob;
-
-	unsigned int			hw_sg_used;
-
-	unsigned int			resp_idx;
-
-	unsigned int			wait_q_prod;
-	unsigned int			wait_q_cons;
-	struct request_queue		*wait_q[CARM_MAX_WAIT_Q];
-
-	void				*msg_base;
-	dma_addr_t			msg_dma;
-
-	int				cur_scan_dev;
-	unsigned long			dev_active;
-	unsigned long			dev_present;
-	struct carm_port		port[CARM_MAX_PORTS];
-
-	struct work_struct		fsm_task;
-
-	int probe_err;
-	struct completion		probe_comp;
-};
-
-struct carm_response {
-	__le32 ret_handle;
-	__le32 status;
-}  __attribute__((packed));
-
-struct carm_msg_sg {
-	__le32 start;
-	__le32 len;
-}  __attribute__((packed));
-
-struct carm_msg_rw {
-	u8 type;
-	u8 id;
-	u8 sg_count;
-	u8 sg_type;
-	__le32 handle;
-	__le32 lba;
-	__le16 lba_count;
-	__le16 lba_high;
-	struct carm_msg_sg sg[32];
-}  __attribute__((packed));
-
-struct carm_msg_allocbuf {
-	u8 type;
-	u8 subtype;
-	u8 n_sg;
-	u8 sg_type;
-	__le32 handle;
-	__le32 addr;
-	__le32 len;
-	__le32 evt_pool;
-	__le32 n_evt;
-	__le32 rbuf_pool;
-	__le32 n_rbuf;
-	__le32 msg_pool;
-	__le32 n_msg;
-	struct carm_msg_sg sg[8];
-}  __attribute__((packed));
-
-struct carm_msg_ioctl {
-	u8 type;
-	u8 subtype;
-	u8 array_id;
-	u8 reserved1;
-	__le32 handle;
-	__le32 data_addr;
-	u32 reserved2;
-}  __attribute__((packed));
-
-struct carm_msg_sync_time {
-	u8 type;
-	u8 subtype;
-	u16 reserved1;
-	__le32 handle;
-	u32 reserved2;
-	__le32 timestamp;
-}  __attribute__((packed));
-
-struct carm_msg_get_fw_ver {
-	u8 type;
-	u8 subtype;
-	u16 reserved1;
-	__le32 handle;
-	__le32 data_addr;
-	u32 reserved2;
-}  __attribute__((packed));
-
-struct carm_fw_ver {
-	__le32 version;
-	u8 features;
-	u8 reserved1;
-	u16 reserved2;
-}  __attribute__((packed));
-
-struct carm_array_info {
-	__le32 size;
-
-	__le16 size_hi;
-	__le16 stripe_size;
-
-	__le32 mode;
-
-	__le16 stripe_blk_sz;
-	__le16 reserved1;
-
-	__le16 cyl;
-	__le16 head;
-
-	__le16 sect;
-	u8 array_id;
-	u8 reserved2;
-
-	char name[40];
-
-	__le32 array_status;
-
-	/* device list continues beyond this point? */
-}  __attribute__((packed));
-
-static int carm_init_one (struct pci_dev *pdev, const struct pci_device_id *ent);
-static void carm_remove_one (struct pci_dev *pdev);
-static int carm_bdev_getgeo(struct block_device *bdev, struct hd_geometry *geo);
-
-static const struct pci_device_id carm_pci_tbl[] = {
-	{ PCI_VENDOR_ID_PROMISE, 0x8000, PCI_ANY_ID, PCI_ANY_ID, 0, 0, },
-	{ PCI_VENDOR_ID_PROMISE, 0x8002, PCI_ANY_ID, PCI_ANY_ID, 0, 0, },
-	{ }	/* terminate list */
-};
-MODULE_DEVICE_TABLE(pci, carm_pci_tbl);
-
-static struct pci_driver carm_driver = {
-	.name		= DRV_NAME,
-	.id_table	= carm_pci_tbl,
-	.probe		= carm_init_one,
-	.remove		= carm_remove_one,
-};
-
-static const struct block_device_operations carm_bd_ops = {
-	.owner		= THIS_MODULE,
-	.getgeo		= carm_bdev_getgeo,
-};
-
-static unsigned int carm_host_id;
-static unsigned long carm_major_alloc;
-
-
-
-static int carm_bdev_getgeo(struct block_device *bdev, struct hd_geometry *geo)
-{
-	struct carm_port *port = bdev->bd_disk->private_data;
-
-	geo->heads = (u8) port->dev_geom_head;
-	geo->sectors = (u8) port->dev_geom_sect;
-	geo->cylinders = port->dev_geom_cyl;
-	return 0;
-}
-
-static const u32 msg_sizes[] = { 32, 64, 128, CARM_MSG_SIZE };
-
-static inline int carm_lookup_bucket(u32 msg_size)
-{
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(msg_sizes); i++)
-		if (msg_size <= msg_sizes[i])
-			return i;
-
-	return -ENOENT;
-}
-
-static void carm_init_buckets(void __iomem *mmio)
-{
-	unsigned int i;
-
-	for (i = 0; i < ARRAY_SIZE(msg_sizes); i++)
-		writel(msg_sizes[i], mmio + CARM_CMS0 + (4 * i));
-}
-
-static inline void *carm_ref_msg(struct carm_host *host,
-				 unsigned int msg_idx)
-{
-	return host->msg_base + (msg_idx * CARM_MSG_SIZE);
-}
-
-static inline dma_addr_t carm_ref_msg_dma(struct carm_host *host,
-					  unsigned int msg_idx)
-{
-	return host->msg_dma + (msg_idx * CARM_MSG_SIZE);
-}
-
-static int carm_send_msg(struct carm_host *host,
-			 struct carm_request *crq, unsigned tag)
-{
-	void __iomem *mmio = host->mmio;
-	u32 msg = (u32) carm_ref_msg_dma(host, tag);
-	u32 cm_bucket = crq->msg_bucket;
-	u32 tmp;
-	int rc = 0;
-
-	VPRINTK("ENTER\n");
-
-	tmp = readl(mmio + CARM_HMUC);
-	if (tmp & CARM_Q_FULL) {
-#if 0
-		tmp = readl(mmio + CARM_INT_MASK);
-		tmp |= INT_Q_AVAILABLE;
-		writel(tmp, mmio + CARM_INT_MASK);
-		readl(mmio + CARM_INT_MASK);	/* flush */
-#endif
-		DPRINTK("host msg queue full\n");
-		rc = -EBUSY;
-	} else {
-		writel(msg | (cm_bucket << 1), mmio + CARM_IHQP);
-		readl(mmio + CARM_IHQP);	/* flush */
-	}
-
-	return rc;
-}
-
-static int carm_array_info (struct carm_host *host, unsigned int array_idx)
-{
-	struct carm_msg_ioctl *ioc;
-	u32 msg_data;
-	dma_addr_t msg_dma;
-	struct carm_request *crq;
-	struct request *rq;
-	int rc;
-
-	rq = blk_mq_alloc_request(host->oob_q, REQ_OP_DRV_OUT, 0);
-	if (IS_ERR(rq)) {
-		rc = -ENOMEM;
-		goto err_out;
-	}
-	crq = blk_mq_rq_to_pdu(rq);
-
-	ioc = carm_ref_msg(host, rq->tag);
-	msg_dma = carm_ref_msg_dma(host, rq->tag);
-	msg_data = (u32) (msg_dma + sizeof(struct carm_array_info));
-
-	crq->msg_type = CARM_MSG_ARRAY;
-	crq->msg_subtype = CARM_ARRAY_INFO;
-	rc = carm_lookup_bucket(sizeof(struct carm_msg_ioctl) +
-				sizeof(struct carm_array_info));
-	BUG_ON(rc < 0);
-	crq->msg_bucket = (u32) rc;
-
-	memset(ioc, 0, sizeof(*ioc));
-	ioc->type	= CARM_MSG_ARRAY;
-	ioc->subtype	= CARM_ARRAY_INFO;
-	ioc->array_id	= (u8) array_idx;
-	ioc->handle	= cpu_to_le32(TAG_ENCODE(rq->tag));
-	ioc->data_addr	= cpu_to_le32(msg_data);
-
-	spin_lock_irq(&host->lock);
-	assert(host->state == HST_DEV_SCAN_START ||
-	       host->state == HST_DEV_SCAN);
-	spin_unlock_irq(&host->lock);
-
-	DPRINTK("blk_execute_rq_nowait, tag == %u\n", rq->tag);
-	blk_execute_rq_nowait(rq, true);
-
-	return 0;
-
-err_out:
-	spin_lock_irq(&host->lock);
-	host->state = HST_ERROR;
-	spin_unlock_irq(&host->lock);
-	return rc;
-}
-
-typedef unsigned int (*carm_sspc_t)(struct carm_host *, unsigned int, void *);
-
-static int carm_send_special (struct carm_host *host, carm_sspc_t func)
-{
-	struct request *rq;
-	struct carm_request *crq;
-	struct carm_msg_ioctl *ioc;
-	void *mem;
-	unsigned int msg_size;
-	int rc;
-
-	rq = blk_mq_alloc_request(host->oob_q, REQ_OP_DRV_OUT, 0);
-	if (IS_ERR(rq))
-		return -ENOMEM;
-	crq = blk_mq_rq_to_pdu(rq);
-
-	mem = carm_ref_msg(host, rq->tag);
-
-	msg_size = func(host, rq->tag, mem);
-
-	ioc = mem;
-	crq->msg_type = ioc->type;
-	crq->msg_subtype = ioc->subtype;
-	rc = carm_lookup_bucket(msg_size);
-	BUG_ON(rc < 0);
-	crq->msg_bucket = (u32) rc;
-
-	DPRINTK("blk_execute_rq_nowait, tag == %u\n", rq->tag);
-	blk_execute_rq_nowait(rq, true);
-
-	return 0;
-}
-
-static unsigned int carm_fill_sync_time(struct carm_host *host,
-					unsigned int idx, void *mem)
-{
-	struct carm_msg_sync_time *st = mem;
-
-	time64_t tv = ktime_get_real_seconds();
-
-	memset(st, 0, sizeof(*st));
-	st->type	= CARM_MSG_MISC;
-	st->subtype	= MISC_SET_TIME;
-	st->handle	= cpu_to_le32(TAG_ENCODE(idx));
-	st->timestamp	= cpu_to_le32(tv);
-
-	return sizeof(struct carm_msg_sync_time);
-}
-
-static unsigned int carm_fill_alloc_buf(struct carm_host *host,
-					unsigned int idx, void *mem)
-{
-	struct carm_msg_allocbuf *ab = mem;
-
-	memset(ab, 0, sizeof(*ab));
-	ab->type	= CARM_MSG_MISC;
-	ab->subtype	= MISC_ALLOC_MEM;
-	ab->handle	= cpu_to_le32(TAG_ENCODE(idx));
-	ab->n_sg	= 1;
-	ab->sg_type	= SGT_32BIT;
-	ab->addr	= cpu_to_le32(host->shm_dma + (PDC_SHM_SIZE >> 1));
-	ab->len		= cpu_to_le32(PDC_SHM_SIZE >> 1);
-	ab->evt_pool	= cpu_to_le32(host->shm_dma + (16 * 1024));
-	ab->n_evt	= cpu_to_le32(1024);
-	ab->rbuf_pool	= cpu_to_le32(host->shm_dma);
-	ab->n_rbuf	= cpu_to_le32(RMSG_Q_LEN);
-	ab->msg_pool	= cpu_to_le32(host->shm_dma + RBUF_LEN);
-	ab->n_msg	= cpu_to_le32(CARM_Q_LEN);
-	ab->sg[0].start	= cpu_to_le32(host->shm_dma + (PDC_SHM_SIZE >> 1));
-	ab->sg[0].len	= cpu_to_le32(65536);
-
-	return sizeof(struct carm_msg_allocbuf);
-}
-
-static unsigned int carm_fill_scan_channels(struct carm_host *host,
-					    unsigned int idx, void *mem)
-{
-	struct carm_msg_ioctl *ioc = mem;
-	u32 msg_data = (u32) (carm_ref_msg_dma(host, idx) +
-			      IOC_SCAN_CHAN_OFFSET);
-
-	memset(ioc, 0, sizeof(*ioc));
-	ioc->type	= CARM_MSG_IOCTL;
-	ioc->subtype	= CARM_IOC_SCAN_CHAN;
-	ioc->handle	= cpu_to_le32(TAG_ENCODE(idx));
-	ioc->data_addr	= cpu_to_le32(msg_data);
-
-	/* fill output data area with "no device" default values */
-	mem += IOC_SCAN_CHAN_OFFSET;
-	memset(mem, IOC_SCAN_CHAN_NODEV, CARM_MAX_PORTS);
-
-	return IOC_SCAN_CHAN_OFFSET + CARM_MAX_PORTS;
-}
-
-static unsigned int carm_fill_get_fw_ver(struct carm_host *host,
-					 unsigned int idx, void *mem)
-{
-	struct carm_msg_get_fw_ver *ioc = mem;
-	u32 msg_data = (u32) (carm_ref_msg_dma(host, idx) + sizeof(*ioc));
-
-	memset(ioc, 0, sizeof(*ioc));
-	ioc->type	= CARM_MSG_MISC;
-	ioc->subtype	= MISC_GET_FW_VER;
-	ioc->handle	= cpu_to_le32(TAG_ENCODE(idx));
-	ioc->data_addr	= cpu_to_le32(msg_data);
-
-	return sizeof(struct carm_msg_get_fw_ver) +
-	       sizeof(struct carm_fw_ver);
-}
-
-static inline void carm_push_q (struct carm_host *host, struct request_queue *q)
-{
-	unsigned int idx = host->wait_q_prod % CARM_MAX_WAIT_Q;
-
-	blk_mq_stop_hw_queues(q);
-	VPRINTK("STOPPED QUEUE %p\n", q);
-
-	host->wait_q[idx] = q;
-	host->wait_q_prod++;
-	BUG_ON(host->wait_q_prod == host->wait_q_cons); /* overrun */
-}
-
-static inline struct request_queue *carm_pop_q(struct carm_host *host)
-{
-	unsigned int idx;
-
-	if (host->wait_q_prod == host->wait_q_cons)
-		return NULL;
-
-	idx = host->wait_q_cons % CARM_MAX_WAIT_Q;
-	host->wait_q_cons++;
-
-	return host->wait_q[idx];
-}
-
-static inline void carm_round_robin(struct carm_host *host)
-{
-	struct request_queue *q = carm_pop_q(host);
-	if (q) {
-		blk_mq_start_hw_queues(q);
-		VPRINTK("STARTED QUEUE %p\n", q);
-	}
-}
-
-static inline enum dma_data_direction carm_rq_dir(struct request *rq)
-{
-	return op_is_write(req_op(rq)) ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
-}
-
-static blk_status_t carm_queue_rq(struct blk_mq_hw_ctx *hctx,
-				  const struct blk_mq_queue_data *bd)
-{
-	struct request_queue *q = hctx->queue;
-	struct request *rq = bd->rq;
-	struct carm_port *port = q->queuedata;
-	struct carm_host *host = port->host;
-	struct carm_request *crq = blk_mq_rq_to_pdu(rq);
-	struct carm_msg_rw *msg;
-	struct scatterlist *sg;
-	int i, n_elem = 0, rc;
-	unsigned int msg_size;
-	u32 tmp;
-
-	crq->n_elem = 0;
-	sg_init_table(crq->sg, CARM_MAX_REQ_SG);
-
-	blk_mq_start_request(rq);
-
-	spin_lock_irq(&host->lock);
-	if (req_op(rq) == REQ_OP_DRV_OUT)
-		goto send_msg;
-
-	/* get scatterlist from block layer */
-	sg = &crq->sg[0];
-	n_elem = blk_rq_map_sg(q, rq, sg);
-	if (n_elem <= 0)
-		goto out_ioerr;
-
-	/* map scatterlist to PCI bus addresses */
-	n_elem = dma_map_sg(&host->pdev->dev, sg, n_elem, carm_rq_dir(rq));
-	if (n_elem <= 0)
-		goto out_ioerr;
-
-	/* obey global hardware limit on S/G entries */
-	if (host->hw_sg_used >= CARM_MAX_HOST_SG - n_elem)
-		goto out_resource;
-
-	crq->n_elem = n_elem;
-	host->hw_sg_used += n_elem;
-
-	/*
-	 * build read/write message
-	 */
-
-	VPRINTK("build msg\n");
-	msg = (struct carm_msg_rw *) carm_ref_msg(host, rq->tag);
-
-	if (rq_data_dir(rq) == WRITE) {
-		msg->type = CARM_MSG_WRITE;
-		crq->msg_type = CARM_MSG_WRITE;
-	} else {
-		msg->type = CARM_MSG_READ;
-		crq->msg_type = CARM_MSG_READ;
-	}
-
-	msg->id		= port->port_no;
-	msg->sg_count	= n_elem;
-	msg->sg_type	= SGT_32BIT;
-	msg->handle	= cpu_to_le32(TAG_ENCODE(rq->tag));
-	msg->lba	= cpu_to_le32(blk_rq_pos(rq) & 0xffffffff);
-	tmp		= (blk_rq_pos(rq) >> 16) >> 16;
-	msg->lba_high	= cpu_to_le16( (u16) tmp );
-	msg->lba_count	= cpu_to_le16(blk_rq_sectors(rq));
-
-	msg_size = sizeof(struct carm_msg_rw) - sizeof(msg->sg);
-	for (i = 0; i < n_elem; i++) {
-		struct carm_msg_sg *carm_sg = &msg->sg[i];
-		carm_sg->start = cpu_to_le32(sg_dma_address(&crq->sg[i]));
-		carm_sg->len = cpu_to_le32(sg_dma_len(&crq->sg[i]));
-		msg_size += sizeof(struct carm_msg_sg);
-	}
-
-	rc = carm_lookup_bucket(msg_size);
-	BUG_ON(rc < 0);
-	crq->msg_bucket = (u32) rc;
-send_msg:
-	/*
-	 * queue read/write message to hardware
-	 */
-	VPRINTK("send msg, tag == %u\n", rq->tag);
-	rc = carm_send_msg(host, crq, rq->tag);
-	if (rc) {
-		host->hw_sg_used -= n_elem;
-		goto out_resource;
-	}
-
-	spin_unlock_irq(&host->lock);
-	return BLK_STS_OK;
-out_resource:
-	dma_unmap_sg(&host->pdev->dev, &crq->sg[0], n_elem, carm_rq_dir(rq));
-	carm_push_q(host, q);
-	spin_unlock_irq(&host->lock);
-	return BLK_STS_DEV_RESOURCE;
-out_ioerr:
-	carm_round_robin(host);
-	spin_unlock_irq(&host->lock);
-	return BLK_STS_IOERR;
-}
-
-static void carm_handle_array_info(struct carm_host *host,
-				   struct carm_request *crq, u8 *mem,
-				   blk_status_t error)
-{
-	struct carm_port *port;
-	u8 *msg_data = mem + sizeof(struct carm_array_info);
-	struct carm_array_info *desc = (struct carm_array_info *) msg_data;
-	u64 lo, hi;
-	int cur_port;
-	size_t slen;
-
-	DPRINTK("ENTER\n");
-
-	if (error)
-		goto out;
-	if (le32_to_cpu(desc->array_status) & ARRAY_NO_EXIST)
-		goto out;
-
-	cur_port = host->cur_scan_dev;
-
-	/* should never occur */
-	if ((cur_port < 0) || (cur_port >= CARM_MAX_PORTS)) {
-		printk(KERN_ERR PFX "BUG: cur_scan_dev==%d, array_id==%d\n",
-		       cur_port, (int) desc->array_id);
-		goto out;
-	}
-
-	port = &host->port[cur_port];
-
-	lo = (u64) le32_to_cpu(desc->size);
-	hi = (u64) le16_to_cpu(desc->size_hi);
-
-	port->capacity = lo | (hi << 32);
-	port->dev_geom_head = le16_to_cpu(desc->head);
-	port->dev_geom_sect = le16_to_cpu(desc->sect);
-	port->dev_geom_cyl = le16_to_cpu(desc->cyl);
-
-	host->dev_active |= (1 << cur_port);
-
-	strncpy(port->name, desc->name, sizeof(port->name));
-	port->name[sizeof(port->name) - 1] = 0;
-	slen = strlen(port->name);
-	while (slen && (port->name[slen - 1] == ' ')) {
-		port->name[slen - 1] = 0;
-		slen--;
-	}
-
-	printk(KERN_INFO DRV_NAME "(%s): port %u device %Lu sectors\n",
-	       pci_name(host->pdev), port->port_no,
-	       (unsigned long long) port->capacity);
-	printk(KERN_INFO DRV_NAME "(%s): port %u device \"%s\"\n",
-	       pci_name(host->pdev), port->port_no, port->name);
-
-out:
-	assert(host->state == HST_DEV_SCAN);
-	schedule_work(&host->fsm_task);
-}
-
-static void carm_handle_scan_chan(struct carm_host *host,
-				  struct carm_request *crq, u8 *mem,
-				  blk_status_t error)
-{
-	u8 *msg_data = mem + IOC_SCAN_CHAN_OFFSET;
-	unsigned int i, dev_count = 0;
-	int new_state = HST_DEV_SCAN_START;
-
-	DPRINTK("ENTER\n");
-
-	if (error) {
-		new_state = HST_ERROR;
-		goto out;
-	}
-
-	/* TODO: scan and support non-disk devices */
-	for (i = 0; i < 8; i++)
-		if (msg_data[i] == 0) { /* direct-access device (disk) */
-			host->dev_present |= (1 << i);
-			dev_count++;
-		}
-
-	printk(KERN_INFO DRV_NAME "(%s): found %u interesting devices\n",
-	       pci_name(host->pdev), dev_count);
-
-out:
-	assert(host->state == HST_PORT_SCAN);
-	host->state = new_state;
-	schedule_work(&host->fsm_task);
-}
-
-static void carm_handle_generic(struct carm_host *host,
-				struct carm_request *crq, blk_status_t error,
-				int cur_state, int next_state)
-{
-	DPRINTK("ENTER\n");
-
-	assert(host->state == cur_state);
-	if (error)
-		host->state = HST_ERROR;
-	else
-		host->state = next_state;
-	schedule_work(&host->fsm_task);
-}
-
-static inline void carm_handle_resp(struct carm_host *host,
-				    __le32 ret_handle_le, u32 status)
-{
-	u32 handle = le32_to_cpu(ret_handle_le);
-	unsigned int msg_idx;
-	struct request *rq;
-	struct carm_request *crq;
-	blk_status_t error = (status == RMSG_OK) ? 0 : BLK_STS_IOERR;
-	u8 *mem;
-
-	VPRINTK("ENTER, handle == 0x%x\n", handle);
-
-	if (unlikely(!TAG_VALID(handle))) {
-		printk(KERN_ERR DRV_NAME "(%s): BUG: invalid tag 0x%x\n",
-		       pci_name(host->pdev), handle);
-		return;
-	}
-
-	msg_idx = TAG_DECODE(handle);
-	VPRINTK("tag == %u\n", msg_idx);
-
-	rq = blk_mq_tag_to_rq(host->tag_set.tags[0], msg_idx);
-	crq = blk_mq_rq_to_pdu(rq);
-
-	/* fast path */
-	if (likely(crq->msg_type == CARM_MSG_READ ||
-		   crq->msg_type == CARM_MSG_WRITE)) {
-		dma_unmap_sg(&host->pdev->dev, &crq->sg[0], crq->n_elem,
-			     carm_rq_dir(rq));
-		goto done;
-	}
-
-	mem = carm_ref_msg(host, msg_idx);
-
-	switch (crq->msg_type) {
-	case CARM_MSG_IOCTL: {
-		switch (crq->msg_subtype) {
-		case CARM_IOC_SCAN_CHAN:
-			carm_handle_scan_chan(host, crq, mem, error);
-			goto done;
-		default:
-			/* unknown / invalid response */
-			goto err_out;
-		}
-		break;
-	}
-
-	case CARM_MSG_MISC: {
-		switch (crq->msg_subtype) {
-		case MISC_ALLOC_MEM:
-			carm_handle_generic(host, crq, error,
-					    HST_ALLOC_BUF, HST_SYNC_TIME);
-			goto done;
-		case MISC_SET_TIME:
-			carm_handle_generic(host, crq, error,
-					    HST_SYNC_TIME, HST_GET_FW_VER);
-			goto done;
-		case MISC_GET_FW_VER: {
-			struct carm_fw_ver *ver = (struct carm_fw_ver *)
-				(mem + sizeof(struct carm_msg_get_fw_ver));
-			if (!error) {
-				host->fw_ver = le32_to_cpu(ver->version);
-				host->flags |= (ver->features & FL_FW_VER_MASK);
-			}
-			carm_handle_generic(host, crq, error,
-					    HST_GET_FW_VER, HST_PORT_SCAN);
-			goto done;
-		}
-		default:
-			/* unknown / invalid response */
-			goto err_out;
-		}
-		break;
-	}
-
-	case CARM_MSG_ARRAY: {
-		switch (crq->msg_subtype) {
-		case CARM_ARRAY_INFO:
-			carm_handle_array_info(host, crq, mem, error);
-			break;
-		default:
-			/* unknown / invalid response */
-			goto err_out;
-		}
-		break;
-	}
-
-	default:
-		/* unknown / invalid response */
-		goto err_out;
-	}
-
-	return;
-
-err_out:
-	printk(KERN_WARNING DRV_NAME "(%s): BUG: unhandled message type %d/%d\n",
-	       pci_name(host->pdev), crq->msg_type, crq->msg_subtype);
-	error = BLK_STS_IOERR;
-done:
-	host->hw_sg_used -= crq->n_elem;
-	blk_mq_end_request(blk_mq_rq_from_pdu(crq), error);
-
-	if (host->hw_sg_used <= CARM_SG_LOW_WATER)
-		carm_round_robin(host);
-}
-
-static inline void carm_handle_responses(struct carm_host *host)
-{
-	void __iomem *mmio = host->mmio;
-	struct carm_response *resp = (struct carm_response *) host->shm;
-	unsigned int work = 0;
-	unsigned int idx = host->resp_idx % RMSG_Q_LEN;
-
-	while (1) {
-		u32 status = le32_to_cpu(resp[idx].status);
-
-		if (status == 0xffffffff) {
-			VPRINTK("ending response on index %u\n", idx);
-			writel(idx << 3, mmio + CARM_RESP_IDX);
-			break;
-		}
-
-		/* response to a message we sent */
-		else if ((status & (1 << 31)) == 0) {
-			VPRINTK("handling msg response on index %u\n", idx);
-			carm_handle_resp(host, resp[idx].ret_handle, status);
-			resp[idx].status = cpu_to_le32(0xffffffff);
-		}
-
-		/* asynchronous events the hardware throws our way */
-		else if ((status & 0xff000000) == (1 << 31)) {
-			u8 *evt_type_ptr = (u8 *) &resp[idx];
-			u8 evt_type = *evt_type_ptr;
-			printk(KERN_WARNING DRV_NAME "(%s): unhandled event type %d\n",
-			       pci_name(host->pdev), (int) evt_type);
-			resp[idx].status = cpu_to_le32(0xffffffff);
-		}
-
-		idx = NEXT_RESP(idx);
-		work++;
-	}
-
-	VPRINTK("EXIT, work==%u\n", work);
-	host->resp_idx += work;
-}
-
-static irqreturn_t carm_interrupt(int irq, void *__host)
-{
-	struct carm_host *host = __host;
-	void __iomem *mmio;
-	u32 mask;
-	int handled = 0;
-	unsigned long flags;
-
-	if (!host) {
-		VPRINTK("no host\n");
-		return IRQ_NONE;
-	}
-
-	spin_lock_irqsave(&host->lock, flags);
-
-	mmio = host->mmio;
-
-	/* reading should also clear interrupts */
-	mask = readl(mmio + CARM_INT_STAT);
-
-	if (mask == 0 || mask == 0xffffffff) {
-		VPRINTK("no work, mask == 0x%x\n", mask);
-		goto out;
-	}
-
-	if (mask & INT_ACK_MASK)
-		writel(mask, mmio + CARM_INT_STAT);
-
-	if (unlikely(host->state == HST_INVALID)) {
-		VPRINTK("not initialized yet, mask = 0x%x\n", mask);
-		goto out;
-	}
-
-	if (mask & CARM_HAVE_RESP) {
-		handled = 1;
-		carm_handle_responses(host);
-	}
-
-out:
-	spin_unlock_irqrestore(&host->lock, flags);
-	VPRINTK("EXIT\n");
-	return IRQ_RETVAL(handled);
-}
-
-static void carm_fsm_task (struct work_struct *work)
-{
-	struct carm_host *host =
-		container_of(work, struct carm_host, fsm_task);
-	unsigned long flags;
-	unsigned int state;
-	int rc, i, next_dev;
-	int reschedule = 0;
-	int new_state = HST_INVALID;
-
-	spin_lock_irqsave(&host->lock, flags);
-	state = host->state;
-	spin_unlock_irqrestore(&host->lock, flags);
-
-	DPRINTK("ENTER, state == %s\n", state_name[state]);
-
-	switch (state) {
-	case HST_PROBE_START:
-		new_state = HST_ALLOC_BUF;
-		reschedule = 1;
-		break;
-
-	case HST_ALLOC_BUF:
-		rc = carm_send_special(host, carm_fill_alloc_buf);
-		if (rc) {
-			new_state = HST_ERROR;
-			reschedule = 1;
-		}
-		break;
-
-	case HST_SYNC_TIME:
-		rc = carm_send_special(host, carm_fill_sync_time);
-		if (rc) {
-			new_state = HST_ERROR;
-			reschedule = 1;
-		}
-		break;
-
-	case HST_GET_FW_VER:
-		rc = carm_send_special(host, carm_fill_get_fw_ver);
-		if (rc) {
-			new_state = HST_ERROR;
-			reschedule = 1;
-		}
-		break;
-
-	case HST_PORT_SCAN:
-		rc = carm_send_special(host, carm_fill_scan_channels);
-		if (rc) {
-			new_state = HST_ERROR;
-			reschedule = 1;
-		}
-		break;
-
-	case HST_DEV_SCAN_START:
-		host->cur_scan_dev = -1;
-		new_state = HST_DEV_SCAN;
-		reschedule = 1;
-		break;
-
-	case HST_DEV_SCAN:
-		next_dev = -1;
-		for (i = host->cur_scan_dev + 1; i < CARM_MAX_PORTS; i++)
-			if (host->dev_present & (1 << i)) {
-				next_dev = i;
-				break;
-			}
-
-		if (next_dev >= 0) {
-			host->cur_scan_dev = next_dev;
-			rc = carm_array_info(host, next_dev);
-			if (rc) {
-				new_state = HST_ERROR;
-				reschedule = 1;
-			}
-		} else {
-			new_state = HST_DEV_ACTIVATE;
-			reschedule = 1;
-		}
-		break;
-
-	case HST_DEV_ACTIVATE: {
-		int activated = 0;
-		for (i = 0; i < CARM_MAX_PORTS; i++)
-			if (host->dev_active & (1 << i)) {
-				struct carm_port *port = &host->port[i];
-				struct gendisk *disk = port->disk;
-
-				set_capacity(disk, port->capacity);
-				host->probe_err = add_disk(disk);
-				if (!host->probe_err)
-					activated++;
-				else
-					break;
-			}
-
-		printk(KERN_INFO DRV_NAME "(%s): %d ports activated\n",
-		       pci_name(host->pdev), activated);
-
-		new_state = HST_PROBE_FINISHED;
-		reschedule = 1;
-		break;
-	}
-	case HST_PROBE_FINISHED:
-		complete(&host->probe_comp);
-		break;
-	case HST_ERROR:
-		/* FIXME: TODO */
-		break;
-
-	default:
-		/* should never occur */
-		printk(KERN_ERR PFX "BUG: unknown state %d\n", state);
-		assert(0);
-		break;
-	}
-
-	if (new_state != HST_INVALID) {
-		spin_lock_irqsave(&host->lock, flags);
-		host->state = new_state;
-		spin_unlock_irqrestore(&host->lock, flags);
-	}
-	if (reschedule)
-		schedule_work(&host->fsm_task);
-}
-
-static int carm_init_wait(void __iomem *mmio, u32 bits, unsigned int test_bit)
-{
-	unsigned int i;
-
-	for (i = 0; i < 50000; i++) {
-		u32 tmp = readl(mmio + CARM_LMUC);
-		udelay(100);
-
-		if (test_bit) {
-			if ((tmp & bits) == bits)
-				return 0;
-		} else {
-			if ((tmp & bits) == 0)
-				return 0;
-		}
-
-		cond_resched();
-	}
-
-	printk(KERN_ERR PFX "carm_init_wait timeout, bits == 0x%x, test_bit == %s\n",
-	       bits, test_bit ? "yes" : "no");
-	return -EBUSY;
-}
-
-static void carm_init_responses(struct carm_host *host)
-{
-	void __iomem *mmio = host->mmio;
-	unsigned int i;
-	struct carm_response *resp = (struct carm_response *) host->shm;
-
-	for (i = 0; i < RMSG_Q_LEN; i++)
-		resp[i].status = cpu_to_le32(0xffffffff);
-
-	writel(0, mmio + CARM_RESP_IDX);
-}
-
-static int carm_init_host(struct carm_host *host)
-{
-	void __iomem *mmio = host->mmio;
-	u32 tmp;
-	u8 tmp8;
-	int rc;
-
-	DPRINTK("ENTER\n");
-
-	writel(0, mmio + CARM_INT_MASK);
-
-	tmp8 = readb(mmio + CARM_INITC);
-	if (tmp8 & 0x01) {
-		tmp8 &= ~0x01;
-		writeb(tmp8, mmio + CARM_INITC);
-		readb(mmio + CARM_INITC);	/* flush */
-
-		DPRINTK("snooze...\n");
-		msleep(5000);
-	}
-
-	tmp = readl(mmio + CARM_HMUC);
-	if (tmp & CARM_CME) {
-		DPRINTK("CME bit present, waiting\n");
-		rc = carm_init_wait(mmio, CARM_CME, 1);
-		if (rc) {
-			DPRINTK("EXIT, carm_init_wait 1 failed\n");
-			return rc;
-		}
-	}
-	if (tmp & CARM_RME) {
-		DPRINTK("RME bit present, waiting\n");
-		rc = carm_init_wait(mmio, CARM_RME, 1);
-		if (rc) {
-			DPRINTK("EXIT, carm_init_wait 2 failed\n");
-			return rc;
-		}
-	}
-
-	tmp &= ~(CARM_RME | CARM_CME);
-	writel(tmp, mmio + CARM_HMUC);
-	readl(mmio + CARM_HMUC);	/* flush */
-
-	rc = carm_init_wait(mmio, CARM_RME | CARM_CME, 0);
-	if (rc) {
-		DPRINTK("EXIT, carm_init_wait 3 failed\n");
-		return rc;
-	}
-
-	carm_init_buckets(mmio);
-
-	writel(host->shm_dma & 0xffffffff, mmio + RBUF_ADDR_LO);
-	writel((host->shm_dma >> 16) >> 16, mmio + RBUF_ADDR_HI);
-	writel(RBUF_LEN, mmio + RBUF_BYTE_SZ);
-
-	tmp = readl(mmio + CARM_HMUC);
-	tmp |= (CARM_RME | CARM_CME | CARM_WZBC);
-	writel(tmp, mmio + CARM_HMUC);
-	readl(mmio + CARM_HMUC);	/* flush */
-
-	rc = carm_init_wait(mmio, CARM_RME | CARM_CME, 1);
-	if (rc) {
-		DPRINTK("EXIT, carm_init_wait 4 failed\n");
-		return rc;
-	}
-
-	writel(0, mmio + CARM_HMPHA);
-	writel(INT_DEF_MASK, mmio + CARM_INT_MASK);
-
-	carm_init_responses(host);
-
-	/* start initialization, probing state machine */
-	spin_lock_irq(&host->lock);
-	assert(host->state == HST_INVALID);
-	host->state = HST_PROBE_START;
-	spin_unlock_irq(&host->lock);
-	schedule_work(&host->fsm_task);
-
-	DPRINTK("EXIT\n");
-	return 0;
-}
-
-static const struct blk_mq_ops carm_mq_ops = {
-	.queue_rq	= carm_queue_rq,
-};
-
-static int carm_init_disk(struct carm_host *host, unsigned int port_no)
-{
-	struct carm_port *port = &host->port[port_no];
-	struct gendisk *disk;
-
-	port->host = host;
-	port->port_no = port_no;
-
-	disk = blk_mq_alloc_disk(&host->tag_set, port);
-	if (IS_ERR(disk))
-		return PTR_ERR(disk);
-
-	port->disk = disk;
-	sprintf(disk->disk_name, DRV_NAME "/%u",
-		(unsigned int)host->id * CARM_MAX_PORTS + port_no);
-	disk->major = host->major;
-	disk->first_minor = port_no * CARM_MINORS_PER_MAJOR;
-	disk->minors = CARM_MINORS_PER_MAJOR;
-	disk->fops = &carm_bd_ops;
-	disk->private_data = port;
-
-	blk_queue_max_segments(disk->queue, CARM_MAX_REQ_SG);
-	blk_queue_segment_boundary(disk->queue, CARM_SG_BOUNDARY);
-	return 0;
-}
-
-static void carm_free_disk(struct carm_host *host, unsigned int port_no)
-{
-	struct carm_port *port = &host->port[port_no];
-	struct gendisk *disk = port->disk;
-
-	if (!disk)
-		return;
-
-	if (host->state > HST_DEV_ACTIVATE)
-		del_gendisk(disk);
-	put_disk(disk);
-}
-
-static int carm_init_shm(struct carm_host *host)
-{
-	host->shm = dma_alloc_coherent(&host->pdev->dev, CARM_SHM_SIZE,
-				       &host->shm_dma, GFP_KERNEL);
-	if (!host->shm)
-		return -ENOMEM;
-
-	host->msg_base = host->shm + RBUF_LEN;
-	host->msg_dma = host->shm_dma + RBUF_LEN;
-
-	memset(host->shm, 0xff, RBUF_LEN);
-	memset(host->msg_base, 0, PDC_SHM_SIZE - RBUF_LEN);
-
-	return 0;
-}
-
-static int carm_init_one (struct pci_dev *pdev, const struct pci_device_id *ent)
-{
-	struct carm_host *host;
-	int rc;
-	struct request_queue *q;
-	unsigned int i;
-
-	printk_once(KERN_DEBUG DRV_NAME " version " DRV_VERSION "\n");
-
-	rc = pci_enable_device(pdev);
-	if (rc)
-		return rc;
-
-	rc = pci_request_regions(pdev, DRV_NAME);
-	if (rc)
-		goto err_out;
-
-	rc = dma_set_mask(&pdev->dev, DMA_BIT_MASK(32));
-	if (rc) {
-		printk(KERN_ERR DRV_NAME "(%s): DMA mask failure\n",
-			pci_name(pdev));
-		goto err_out_regions;
-	}
-
-	host = kzalloc(sizeof(*host), GFP_KERNEL);
-	if (!host) {
-		rc = -ENOMEM;
-		goto err_out_regions;
-	}
-
-	host->pdev = pdev;
-	spin_lock_init(&host->lock);
-	INIT_WORK(&host->fsm_task, carm_fsm_task);
-	init_completion(&host->probe_comp);
-
-	host->mmio = ioremap(pci_resource_start(pdev, 0),
-			     pci_resource_len(pdev, 0));
-	if (!host->mmio) {
-		printk(KERN_ERR DRV_NAME "(%s): MMIO alloc failure\n",
-		       pci_name(pdev));
-		rc = -ENOMEM;
-		goto err_out_kfree;
-	}
-
-	rc = carm_init_shm(host);
-	if (rc) {
-		printk(KERN_ERR DRV_NAME "(%s): DMA SHM alloc failure\n",
-		       pci_name(pdev));
-		goto err_out_iounmap;
-	}
-
-	memset(&host->tag_set, 0, sizeof(host->tag_set));
-	host->tag_set.ops = &carm_mq_ops;
-	host->tag_set.cmd_size = sizeof(struct carm_request);
-	host->tag_set.nr_hw_queues = 1;
-	host->tag_set.nr_maps = 1;
-	host->tag_set.queue_depth = max_queue;
-	host->tag_set.numa_node = NUMA_NO_NODE;
-	host->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
-
-	rc = blk_mq_alloc_tag_set(&host->tag_set);
-	if (rc)
-		goto err_out_dma_free;
-
-	q = blk_mq_init_queue(&host->tag_set);
-	if (IS_ERR(q)) {
-		rc = PTR_ERR(q);
-		blk_mq_free_tag_set(&host->tag_set);
-		goto err_out_dma_free;
-	}
-
-	host->oob_q = q;
-	q->queuedata = host;
-
-	/*
-	 * Figure out which major to use: 160, 161, or dynamic
-	 */
-	if (!test_and_set_bit(0, &carm_major_alloc))
-		host->major = 160;
-	else if (!test_and_set_bit(1, &carm_major_alloc))
-		host->major = 161;
-	else
-		host->flags |= FL_DYN_MAJOR;
-
-	host->id = carm_host_id;
-	sprintf(host->name, DRV_NAME "%d", carm_host_id);
-
-	rc = register_blkdev(host->major, host->name);
-	if (rc < 0)
-		goto err_out_free_majors;
-	if (host->flags & FL_DYN_MAJOR)
-		host->major = rc;
-
-	for (i = 0; i < CARM_MAX_PORTS; i++) {
-		rc = carm_init_disk(host, i);
-		if (rc)
-			goto err_out_blkdev_disks;
-	}
-
-	pci_set_master(pdev);
-
-	rc = request_irq(pdev->irq, carm_interrupt, IRQF_SHARED, DRV_NAME, host);
-	if (rc) {
-		printk(KERN_ERR DRV_NAME "(%s): irq alloc failure\n",
-		       pci_name(pdev));
-		goto err_out_blkdev_disks;
-	}
-
-	rc = carm_init_host(host);
-	if (rc)
-		goto err_out_free_irq;
-
-	DPRINTK("waiting for probe_comp\n");
-	host->probe_err = -ENODEV;
-	wait_for_completion(&host->probe_comp);
-	if (host->probe_err) {
-		rc = host->probe_err;
-		goto err_out_free_irq;
-	}
-
-	printk(KERN_INFO "%s: pci %s, ports %d, io %llx, irq %u, major %d\n",
-	       host->name, pci_name(pdev), (int) CARM_MAX_PORTS,
-	       (unsigned long long)pci_resource_start(pdev, 0),
-		   pdev->irq, host->major);
-
-	carm_host_id++;
-	pci_set_drvdata(pdev, host);
-	return 0;
-
-err_out_free_irq:
-	free_irq(pdev->irq, host);
-err_out_blkdev_disks:
-	for (i = 0; i < CARM_MAX_PORTS; i++)
-		carm_free_disk(host, i);
-	unregister_blkdev(host->major, host->name);
-err_out_free_majors:
-	if (host->major == 160)
-		clear_bit(0, &carm_major_alloc);
-	else if (host->major == 161)
-		clear_bit(1, &carm_major_alloc);
-	blk_mq_destroy_queue(host->oob_q);
-	blk_mq_free_tag_set(&host->tag_set);
-err_out_dma_free:
-	dma_free_coherent(&pdev->dev, CARM_SHM_SIZE, host->shm, host->shm_dma);
-err_out_iounmap:
-	iounmap(host->mmio);
-err_out_kfree:
-	kfree(host);
-err_out_regions:
-	pci_release_regions(pdev);
-err_out:
-	pci_disable_device(pdev);
-	return rc;
-}
-
-static void carm_remove_one (struct pci_dev *pdev)
-{
-	struct carm_host *host = pci_get_drvdata(pdev);
-	unsigned int i;
-
-	if (!host) {
-		printk(KERN_ERR PFX "BUG: no host data for PCI(%s)\n",
-		       pci_name(pdev));
-		return;
-	}
-
-	free_irq(pdev->irq, host);
-	for (i = 0; i < CARM_MAX_PORTS; i++)
-		carm_free_disk(host, i);
-	unregister_blkdev(host->major, host->name);
-	if (host->major == 160)
-		clear_bit(0, &carm_major_alloc);
-	else if (host->major == 161)
-		clear_bit(1, &carm_major_alloc);
-	blk_mq_destroy_queue(host->oob_q);
-	blk_mq_free_tag_set(&host->tag_set);
-	dma_free_coherent(&pdev->dev, CARM_SHM_SIZE, host->shm, host->shm_dma);
-	iounmap(host->mmio);
-	kfree(host);
-	pci_release_regions(pdev);
-	pci_disable_device(pdev);
-}
-
-module_pci_driver(carm_driver);
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 3f1906965ac8..2b7d1db5c4a7 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -47,7 +47,12 @@
 #define UBLK_MINORS		(1U << MINORBITS)
 
 /* All UBLK_F_* have to be included into UBLK_F_ALL */
-#define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY | UBLK_F_URING_CMD_COMP_IN_TASK)
+#define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY \
+		| UBLK_F_URING_CMD_COMP_IN_TASK \
+		| UBLK_F_NEED_GET_DATA)
+
+/* All UBLK_PARAM_TYPE_* should be included here */
+#define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD)
 
 struct ublk_rq_data {
 	struct callback_head work;
@@ -86,6 +91,15 @@ struct ublk_uring_cmd_pdu {
  */
 #define UBLK_IO_FLAG_ABORTED 0x04
 
+/*
+ * UBLK_IO_FLAG_NEED_GET_DATA is set because IO command requires
+ * get data buffer address from ublksrv.
+ *
+ * Then, bio data could be copied into this data buffer for a WRITE request
+ * after the IO command is issued again and UBLK_IO_FLAG_NEED_GET_DATA is unset.
+ */
+#define UBLK_IO_FLAG_NEED_GET_DATA 0x08
+
 struct ublk_io {
 	/* userspace buffer address from io cmd */
 	__u64	addr;
@@ -119,7 +133,6 @@ struct ublk_device {
 	char	*__queues;
 
 	unsigned short  queue_size;
-	unsigned short  bs_shift;
 	struct ublksrv_ctrl_dev_info	dev_info;
 
 	struct blk_mq_tag_set	tag_set;
@@ -137,6 +150,8 @@ struct ublk_device {
 	spinlock_t		mm_lock;
 	struct mm_struct	*mm;
 
+	struct ublk_params	params;
+
 	struct completion	completion;
 	unsigned int		nr_queues_ready;
 	atomic_t		nr_aborted_queues;
@@ -149,6 +164,12 @@ struct ublk_device {
 	struct work_struct	stop_work;
 };
 
+/* header of ublk_params */
+struct ublk_params_header {
+	__u32	len;
+	__u32	types;
+};
+
 static dev_t ublk_chr_devt;
 static struct class *ublk_chr_class;
 
@@ -160,6 +181,90 @@ static DEFINE_MUTEX(ublk_ctl_mutex);
 
 static struct miscdevice ublk_misc;
 
+static void ublk_dev_param_basic_apply(struct ublk_device *ub)
+{
+	struct request_queue *q = ub->ub_disk->queue;
+	const struct ublk_param_basic *p = &ub->params.basic;
+
+	blk_queue_logical_block_size(q, 1 << p->logical_bs_shift);
+	blk_queue_physical_block_size(q, 1 << p->physical_bs_shift);
+	blk_queue_io_min(q, 1 << p->io_min_shift);
+	blk_queue_io_opt(q, 1 << p->io_opt_shift);
+
+	blk_queue_write_cache(q, p->attrs & UBLK_ATTR_VOLATILE_CACHE,
+			p->attrs & UBLK_ATTR_FUA);
+	if (p->attrs & UBLK_ATTR_ROTATIONAL)
+		blk_queue_flag_clear(QUEUE_FLAG_NONROT, q);
+	else
+		blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
+
+	blk_queue_max_hw_sectors(q, p->max_sectors);
+	blk_queue_chunk_sectors(q, p->chunk_sectors);
+	blk_queue_virt_boundary(q, p->virt_boundary_mask);
+
+	if (p->attrs & UBLK_ATTR_READ_ONLY)
+		set_disk_ro(ub->ub_disk, true);
+
+	set_capacity(ub->ub_disk, p->dev_sectors);
+}
+
+static void ublk_dev_param_discard_apply(struct ublk_device *ub)
+{
+	struct request_queue *q = ub->ub_disk->queue;
+	const struct ublk_param_discard *p = &ub->params.discard;
+
+	q->limits.discard_alignment = p->discard_alignment;
+	q->limits.discard_granularity = p->discard_granularity;
+	blk_queue_max_discard_sectors(q, p->max_discard_sectors);
+	blk_queue_max_write_zeroes_sectors(q,
+			p->max_write_zeroes_sectors);
+	blk_queue_max_discard_segments(q, p->max_discard_segments);
+}
+
+static int ublk_validate_params(const struct ublk_device *ub)
+{
+	/* basic param is the only one which must be set */
+	if (ub->params.types & UBLK_PARAM_TYPE_BASIC) {
+		const struct ublk_param_basic *p = &ub->params.basic;
+
+		if (p->logical_bs_shift > PAGE_SHIFT)
+			return -EINVAL;
+
+		if (p->logical_bs_shift > p->physical_bs_shift)
+			return -EINVAL;
+
+		if (p->max_sectors > (ub->dev_info.max_io_buf_bytes >> 9))
+			return -EINVAL;
+	} else
+		return -EINVAL;
+
+	if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) {
+		const struct ublk_param_discard *p = &ub->params.discard;
+
+		/* So far, only support single segment discard */
+		if (p->max_discard_sectors && p->max_discard_segments != 1)
+			return -EINVAL;
+
+		if (!p->discard_granularity)
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int ublk_apply_params(struct ublk_device *ub)
+{
+	if (!(ub->params.types & UBLK_PARAM_TYPE_BASIC))
+		return -EINVAL;
+
+	ublk_dev_param_basic_apply(ub);
+
+	if (ub->params.types & UBLK_PARAM_TYPE_DISCARD)
+		ublk_dev_param_discard_apply(ub);
+
+	return 0;
+}
+
 static inline bool ublk_can_use_task_work(const struct ublk_queue *ubq)
 {
 	if (IS_BUILTIN(CONFIG_BLK_DEV_UBLK) &&
@@ -168,6 +273,13 @@ static inline bool ublk_can_use_task_work(const struct ublk_queue *ubq)
 	return false;
 }
 
+static inline bool ublk_need_get_data(const struct ublk_queue *ubq)
+{
+	if (ubq->flags & UBLK_F_NEED_GET_DATA)
+		return true;
+	return false;
+}
+
 static struct ublk_device *ublk_get_device(struct ublk_device *ub)
 {
 	if (kobject_get_unless_zero(&ub->cdev_dev.kobj))
@@ -509,6 +621,21 @@ static void __ublk_fail_req(struct ublk_io *io, struct request *req)
 	}
 }
 
+static void ubq_complete_io_cmd(struct ublk_io *io, int res)
+{
+	/* mark this cmd owned by ublksrv */
+	io->flags |= UBLK_IO_FLAG_OWNED_BY_SRV;
+
+	/*
+	 * clear ACTIVE since we are done with this sqe/cmd slot
+	 * We can only accept io cmd in case of being not active.
+	 */
+	io->flags &= ~UBLK_IO_FLAG_ACTIVE;
+
+	/* tell ublksrv one io request is coming */
+	io_uring_cmd_done(io->cmd, res, 0);
+}
+
 #define UBLK_REQUEUE_DELAY_MS	3
 
 static inline void __ublk_rq_task_work(struct request *req)
@@ -531,6 +658,30 @@ static inline void __ublk_rq_task_work(struct request *req)
 		return;
 	}
 
+	if (ublk_need_get_data(ubq) &&
+			(req_op(req) == REQ_OP_WRITE ||
+			req_op(req) == REQ_OP_FLUSH)) {
+		/*
+		 * We have not handled UBLK_IO_NEED_GET_DATA command yet,
+		 * so immepdately pass UBLK_IO_RES_NEED_GET_DATA to ublksrv
+		 * and notify it.
+		 */
+		if (!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA)) {
+			io->flags |= UBLK_IO_FLAG_NEED_GET_DATA;
+			pr_devel("%s: need get data. op %d, qid %d tag %d io_flags %x\n",
+					__func__, io->cmd->cmd_op, ubq->q_id,
+					req->tag, io->flags);
+			ubq_complete_io_cmd(io, UBLK_IO_RES_NEED_GET_DATA);
+			return;
+		}
+		/*
+		 * We have handled UBLK_IO_NEED_GET_DATA command,
+		 * so clear UBLK_IO_FLAG_NEED_GET_DATA now and just
+		 * do the copy work.
+		 */
+		io->flags &= ~UBLK_IO_FLAG_NEED_GET_DATA;
+	}
+
 	mapped_bytes = ublk_map_io(ubq, req, io);
 
 	/* partially mapped, update io descriptor */
@@ -553,17 +704,7 @@ static inline void __ublk_rq_task_work(struct request *req)
 			mapped_bytes >> 9;
 	}
 
-	/* mark this cmd owned by ublksrv */
-	io->flags |= UBLK_IO_FLAG_OWNED_BY_SRV;
-
-	/*
-	 * clear ACTIVE since we are done with this sqe/cmd slot
-	 * We can only accept io cmd in case of being not active.
-	 */
-	io->flags &= ~UBLK_IO_FLAG_ACTIVE;
-
-	/* tell ublksrv one io request is coming */
-	io_uring_cmd_done(io->cmd, UBLK_IO_RES_OK, 0);
+	ubq_complete_io_cmd(io, UBLK_IO_RES_OK);
 }
 
 static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd)
@@ -788,16 +929,27 @@ static void ublk_daemon_monitor_work(struct work_struct *work)
 				UBLK_DAEMON_MONITOR_PERIOD);
 }
 
+static inline bool ublk_queue_ready(struct ublk_queue *ubq)
+{
+	return ubq->nr_io_ready == ubq->q_depth;
+}
+
 static void ublk_cancel_queue(struct ublk_queue *ubq)
 {
 	int i;
 
+	if (!ublk_queue_ready(ubq))
+		return;
+
 	for (i = 0; i < ubq->q_depth; i++) {
 		struct ublk_io *io = &ubq->ios[i];
 
 		if (io->flags & UBLK_IO_FLAG_ACTIVE)
 			io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, 0);
 	}
+
+	/* all io commands are canceled */
+	ubq->nr_io_ready = 0;
 }
 
 /* Cancel all pending commands, must be called after del_gendisk() returns */
@@ -818,19 +970,14 @@ static void ublk_stop_dev(struct ublk_device *ub)
 	del_gendisk(ub->ub_disk);
 	ub->dev_info.state = UBLK_S_DEV_DEAD;
 	ub->dev_info.ublksrv_pid = -1;
-	ublk_cancel_dev(ub);
 	put_disk(ub->ub_disk);
 	ub->ub_disk = NULL;
  unlock:
+	ublk_cancel_dev(ub);
 	mutex_unlock(&ub->mutex);
 	cancel_delayed_work_sync(&ub->monitor_work);
 }
 
-static inline bool ublk_queue_ready(struct ublk_queue *ubq)
-{
-	return ubq->nr_io_ready == ubq->q_depth;
-}
-
 /* device can only be started after all IOs are ready */
 static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq)
 {
@@ -846,6 +993,25 @@ static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq)
 	mutex_unlock(&ub->mutex);
 }
 
+static void ublk_handle_need_get_data(struct ublk_device *ub, int q_id,
+		int tag, struct io_uring_cmd *cmd)
+{
+	struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
+	struct request *req = blk_mq_tag_to_rq(ub->tag_set.tags[q_id], tag);
+
+	if (ublk_can_use_task_work(ubq)) {
+		struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
+
+		/* should not fail since we call it just in ubq->ubq_daemon */
+		task_work_add(ubq->ubq_daemon, &data->work, TWA_SIGNAL_NO_IPI);
+	} else {
+		struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
+
+		pdu->req = req;
+		io_uring_cmd_complete_in_task(cmd, ublk_rq_task_work_cb);
+	}
+}
+
 static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
 {
 	struct ublksrv_io_cmd *ub_cmd = (struct ublksrv_io_cmd *)cmd->cmd;
@@ -884,6 +1050,14 @@ static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
 		goto out;
 	}
 
+	/*
+	 * ensure that the user issues UBLK_IO_NEED_GET_DATA
+	 * iff the driver have set the UBLK_IO_FLAG_NEED_GET_DATA.
+	 */
+	if ((!!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA))
+			^ (cmd_op == UBLK_IO_NEED_GET_DATA))
+		goto out;
+
 	switch (cmd_op) {
 	case UBLK_IO_FETCH_REQ:
 		/* UBLK_IO_FETCH_REQ is only allowed before queue is setup */
@@ -917,6 +1091,14 @@ static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
 		io->cmd = cmd;
 		ublk_commit_completion(ub, ub_cmd);
 		break;
+	case UBLK_IO_NEED_GET_DATA:
+		if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
+			goto out;
+		io->addr = ub_cmd->addr;
+		io->cmd = cmd;
+		io->flags |= UBLK_IO_FLAG_ACTIVE;
+		ublk_handle_need_get_data(ub, ub_cmd->q_id, ub_cmd->tag, cmd);
+		break;
 	default:
 		goto out;
 	}
@@ -1083,13 +1265,13 @@ static void ublk_stop_work_fn(struct work_struct *work)
 	ublk_stop_dev(ub);
 }
 
-/* align maximum I/O size to PAGE_SIZE */
+/* align max io buffer size with PAGE_SIZE */
 static void ublk_align_max_io_size(struct ublk_device *ub)
 {
-	unsigned int max_rq_bytes = ub->dev_info.rq_max_blocks << ub->bs_shift;
+	unsigned int max_io_bytes = ub->dev_info.max_io_buf_bytes;
 
-	ub->dev_info.rq_max_blocks =
-		round_down(max_rq_bytes, PAGE_SIZE) >> ub->bs_shift;
+	ub->dev_info.max_io_buf_bytes =
+		round_down(max_io_bytes, PAGE_SIZE);
 }
 
 static int ublk_add_tag_set(struct ublk_device *ub)
@@ -1132,7 +1314,6 @@ static int ublk_ctrl_start_dev(struct io_uring_cmd *cmd)
 {
 	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
 	int ublksrv_pid = (int)header->data[0];
-	unsigned long dev_blocks = header->data[1];
 	struct ublk_device *ub;
 	struct gendisk *disk;
 	int ret = -EINVAL;
@@ -1155,10 +1336,6 @@ static int ublk_ctrl_start_dev(struct io_uring_cmd *cmd)
 		goto out_unlock;
 	}
 
-	/* We may get disk size updated */
-	if (dev_blocks)
-		ub->dev_info.dev_blocks = dev_blocks;
-
 	disk = blk_mq_alloc_disk(&ub->tag_set, ub);
 	if (IS_ERR(disk)) {
 		ret = PTR_ERR(disk);
@@ -1168,27 +1345,28 @@ static int ublk_ctrl_start_dev(struct io_uring_cmd *cmd)
 	disk->fops = &ub_fops;
 	disk->private_data = ub;
 
-	blk_queue_logical_block_size(disk->queue, ub->dev_info.block_size);
-	blk_queue_physical_block_size(disk->queue, ub->dev_info.block_size);
-	blk_queue_io_min(disk->queue, ub->dev_info.block_size);
-	blk_queue_max_hw_sectors(disk->queue,
-		ub->dev_info.rq_max_blocks << (ub->bs_shift - 9));
-	disk->queue->limits.discard_granularity = PAGE_SIZE;
-	blk_queue_max_discard_sectors(disk->queue, UINT_MAX >> 9);
-	blk_queue_max_write_zeroes_sectors(disk->queue, UINT_MAX >> 9);
-
-	set_capacity(disk, ub->dev_info.dev_blocks << (ub->bs_shift - 9));
-
 	ub->dev_info.ublksrv_pid = ublksrv_pid;
 	ub->ub_disk = disk;
+
+	ret = ublk_apply_params(ub);
+	if (ret)
+		goto out_put_disk;
+
 	get_device(&ub->cdev_dev);
 	ret = add_disk(disk);
 	if (ret) {
-		put_disk(disk);
-		goto out_unlock;
+		/*
+		 * Has to drop the reference since ->free_disk won't be
+		 * called in case of add_disk failure.
+		 */
+		ublk_put_device(ub);
+		goto out_put_disk;
 	}
 	set_bit(UB_STATE_USED, &ub->state);
 	ub->dev_info.state = UBLK_S_DEV_LIVE;
+out_put_disk:
+	if (ret)
+		put_disk(disk);
 out_unlock:
 	mutex_unlock(&ub->mutex);
 	ublk_put_device(ub);
@@ -1250,9 +1428,8 @@ static inline void ublk_dump_dev_info(struct ublksrv_ctrl_dev_info *info)
 {
 	pr_devel("%s: dev id %d flags %llx\n", __func__,
 			info->dev_id, info->flags);
-	pr_devel("\t nr_hw_queues %d queue_depth %d block size %d dev_capacity %lld\n",
-			info->nr_hw_queues, info->queue_depth,
-			info->block_size, info->dev_blocks);
+	pr_devel("\t nr_hw_queues %d queue_depth %d\n",
+			info->nr_hw_queues, info->queue_depth);
 }
 
 static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
@@ -1312,7 +1489,6 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
 	/* We are not ready to support zero copy */
 	ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY;
 
-	ub->bs_shift = ilog2(ub->dev_info.block_size);
 	ub->dev_info.nr_hw_queues = min_t(unsigned int,
 			ub->dev_info.nr_hw_queues, nr_cpu_ids);
 	ublk_align_max_io_size(ub);
@@ -1436,6 +1612,82 @@ static int ublk_ctrl_get_dev_info(struct io_uring_cmd *cmd)
 	return ret;
 }
 
+static int ublk_ctrl_get_params(struct io_uring_cmd *cmd)
+{
+	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
+	void __user *argp = (void __user *)(unsigned long)header->addr;
+	struct ublk_params_header ph;
+	struct ublk_device *ub;
+	int ret;
+
+	if (header->len <= sizeof(ph) || !header->addr)
+		return -EINVAL;
+
+	if (copy_from_user(&ph, argp, sizeof(ph)))
+		return -EFAULT;
+
+	if (ph.len > header->len || !ph.len)
+		return -EINVAL;
+
+	if (ph.len > sizeof(struct ublk_params))
+		ph.len = sizeof(struct ublk_params);
+
+	ub = ublk_get_device_from_id(header->dev_id);
+	if (!ub)
+		return -EINVAL;
+
+	mutex_lock(&ub->mutex);
+	if (copy_to_user(argp, &ub->params, ph.len))
+		ret = -EFAULT;
+	else
+		ret = 0;
+	mutex_unlock(&ub->mutex);
+
+	ublk_put_device(ub);
+	return ret;
+}
+
+static int ublk_ctrl_set_params(struct io_uring_cmd *cmd)
+{
+	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
+	void __user *argp = (void __user *)(unsigned long)header->addr;
+	struct ublk_params_header ph;
+	struct ublk_device *ub;
+	int ret = -EFAULT;
+
+	if (header->len <= sizeof(ph) || !header->addr)
+		return -EINVAL;
+
+	if (copy_from_user(&ph, argp, sizeof(ph)))
+		return -EFAULT;
+
+	if (ph.len > header->len || !ph.len || !ph.types)
+		return -EINVAL;
+
+	if (ph.len > sizeof(struct ublk_params))
+		ph.len = sizeof(struct ublk_params);
+
+	ub = ublk_get_device_from_id(header->dev_id);
+	if (!ub)
+		return -EINVAL;
+
+	/* parameters can only be changed when device isn't live */
+	mutex_lock(&ub->mutex);
+	if (ub->dev_info.state == UBLK_S_DEV_LIVE) {
+		ret = -EACCES;
+	} else if (copy_from_user(&ub->params, argp, ph.len)) {
+		ret = -EFAULT;
+	} else {
+		/* clear all we don't support yet */
+		ub->params.types &= UBLK_PARAM_TYPE_ALL;
+		ret = ublk_validate_params(ub);
+	}
+	mutex_unlock(&ub->mutex);
+	ublk_put_device(ub);
+
+	return ret;
+}
+
 static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
 		unsigned int issue_flags)
 {
@@ -1471,6 +1723,12 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
 	case UBLK_CMD_GET_QUEUE_AFFINITY:
 		ret = ublk_ctrl_get_queue_affinity(cmd);
 		break;
+	case UBLK_CMD_GET_PARAMS:
+		ret = ublk_ctrl_get_params(cmd);
+		break;
+	case UBLK_CMD_SET_PARAMS:
+		ret = ublk_ctrl_set_params(cmd);
+		break;
 	default:
 		break;
 	}
diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig
index 33d3ce9c888e..aa36ac618e72 100644
--- a/drivers/infiniband/Kconfig
+++ b/drivers/infiniband/Kconfig
@@ -78,20 +78,21 @@ config INFINIBAND_VIRT_DMA
 	def_bool !HIGHMEM
 
 if INFINIBAND_USER_ACCESS || !INFINIBAND_USER_ACCESS
-source "drivers/infiniband/hw/mthca/Kconfig"
-source "drivers/infiniband/hw/qib/Kconfig"
+source "drivers/infiniband/hw/bnxt_re/Kconfig"
 source "drivers/infiniband/hw/cxgb4/Kconfig"
 source "drivers/infiniband/hw/efa/Kconfig"
+source "drivers/infiniband/hw/erdma/Kconfig"
+source "drivers/infiniband/hw/hfi1/Kconfig"
+source "drivers/infiniband/hw/hns/Kconfig"
 source "drivers/infiniband/hw/irdma/Kconfig"
 source "drivers/infiniband/hw/mlx4/Kconfig"
 source "drivers/infiniband/hw/mlx5/Kconfig"
+source "drivers/infiniband/hw/mthca/Kconfig"
 source "drivers/infiniband/hw/ocrdma/Kconfig"
-source "drivers/infiniband/hw/vmw_pvrdma/Kconfig"
-source "drivers/infiniband/hw/usnic/Kconfig"
-source "drivers/infiniband/hw/hns/Kconfig"
-source "drivers/infiniband/hw/bnxt_re/Kconfig"
-source "drivers/infiniband/hw/hfi1/Kconfig"
 source "drivers/infiniband/hw/qedr/Kconfig"
+source "drivers/infiniband/hw/qib/Kconfig"
+source "drivers/infiniband/hw/usnic/Kconfig"
+source "drivers/infiniband/hw/vmw_pvrdma/Kconfig"
 source "drivers/infiniband/sw/rdmavt/Kconfig"
 source "drivers/infiniband/sw/rxe/Kconfig"
 source "drivers/infiniband/sw/siw/Kconfig"
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index fabca5e51e3d..46d06678dfbe 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -11,6 +11,7 @@
 #include <linux/in6.h>
 #include <linux/mutex.h>
 #include <linux/random.h>
+#include <linux/rbtree.h>
 #include <linux/igmp.h>
 #include <linux/xarray.h>
 #include <linux/inetdevice.h>
@@ -20,6 +21,7 @@
 
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
+#include <net/netevent.h>
 #include <net/tcp.h>
 #include <net/ipv6.h>
 #include <net/ip_fib.h>
@@ -168,6 +170,9 @@ static struct ib_sa_client sa_client;
 static LIST_HEAD(dev_list);
 static LIST_HEAD(listen_any_list);
 static DEFINE_MUTEX(lock);
+static struct rb_root id_table = RB_ROOT;
+/* Serialize operations of id_table tree */
+static DEFINE_SPINLOCK(id_table_lock);
 static struct workqueue_struct *cma_wq;
 static unsigned int cma_pernet_id;
 
@@ -202,6 +207,11 @@ struct xarray *cma_pernet_xa(struct net *net, enum rdma_ucm_port_space ps)
 	}
 }
 
+struct id_table_entry {
+	struct list_head id_list;
+	struct rb_node rb_node;
+};
+
 struct cma_device {
 	struct list_head	list;
 	struct ib_device	*device;
@@ -420,11 +430,21 @@ static inline u8 cma_get_ip_ver(const struct cma_hdr *hdr)
 	return hdr->ip_version >> 4;
 }
 
-static inline void cma_set_ip_ver(struct cma_hdr *hdr, u8 ip_ver)
+static void cma_set_ip_ver(struct cma_hdr *hdr, u8 ip_ver)
 {
 	hdr->ip_version = (ip_ver << 4) | (hdr->ip_version & 0xF);
 }
 
+static struct sockaddr *cma_src_addr(struct rdma_id_private *id_priv)
+{
+	return (struct sockaddr *)&id_priv->id.route.addr.src_addr;
+}
+
+static inline struct sockaddr *cma_dst_addr(struct rdma_id_private *id_priv)
+{
+	return (struct sockaddr *)&id_priv->id.route.addr.dst_addr;
+}
+
 static int cma_igmp_send(struct net_device *ndev, union ib_gid *mgid, bool join)
 {
 	struct in_device *in_dev = NULL;
@@ -445,6 +465,117 @@ static int cma_igmp_send(struct net_device *ndev, union ib_gid *mgid, bool join)
 	return (in_dev) ? 0 : -ENODEV;
 }
 
+static int compare_netdev_and_ip(int ifindex_a, struct sockaddr *sa,
+				 struct id_table_entry *entry_b)
+{
+	struct rdma_id_private *id_priv = list_first_entry(
+		&entry_b->id_list, struct rdma_id_private, id_list_entry);
+	int ifindex_b = id_priv->id.route.addr.dev_addr.bound_dev_if;
+	struct sockaddr *sb = cma_dst_addr(id_priv);
+
+	if (ifindex_a != ifindex_b)
+		return (ifindex_a > ifindex_b) ? 1 : -1;
+
+	if (sa->sa_family != sb->sa_family)
+		return sa->sa_family - sb->sa_family;
+
+	if (sa->sa_family == AF_INET)
+		return memcmp((char *)&((struct sockaddr_in *)sa)->sin_addr,
+			      (char *)&((struct sockaddr_in *)sb)->sin_addr,
+			      sizeof(((struct sockaddr_in *)sa)->sin_addr));
+
+	return ipv6_addr_cmp(&((struct sockaddr_in6 *)sa)->sin6_addr,
+			     &((struct sockaddr_in6 *)sb)->sin6_addr);
+}
+
+static int cma_add_id_to_tree(struct rdma_id_private *node_id_priv)
+{
+	struct rb_node **new, *parent = NULL;
+	struct id_table_entry *this, *node;
+	unsigned long flags;
+	int result;
+
+	node = kzalloc(sizeof(*node), GFP_KERNEL);
+	if (!node)
+		return -ENOMEM;
+
+	spin_lock_irqsave(&id_table_lock, flags);
+	new = &id_table.rb_node;
+	while (*new) {
+		this = container_of(*new, struct id_table_entry, rb_node);
+		result = compare_netdev_and_ip(
+			node_id_priv->id.route.addr.dev_addr.bound_dev_if,
+			cma_dst_addr(node_id_priv), this);
+
+		parent = *new;
+		if (result < 0)
+			new = &((*new)->rb_left);
+		else if (result > 0)
+			new = &((*new)->rb_right);
+		else {
+			list_add_tail(&node_id_priv->id_list_entry,
+				      &this->id_list);
+			kfree(node);
+			goto unlock;
+		}
+	}
+
+	INIT_LIST_HEAD(&node->id_list);
+	list_add_tail(&node_id_priv->id_list_entry, &node->id_list);
+
+	rb_link_node(&node->rb_node, parent, new);
+	rb_insert_color(&node->rb_node, &id_table);
+
+unlock:
+	spin_unlock_irqrestore(&id_table_lock, flags);
+	return 0;
+}
+
+static struct id_table_entry *
+node_from_ndev_ip(struct rb_root *root, int ifindex, struct sockaddr *sa)
+{
+	struct rb_node *node = root->rb_node;
+	struct id_table_entry *data;
+	int result;
+
+	while (node) {
+		data = container_of(node, struct id_table_entry, rb_node);
+		result = compare_netdev_and_ip(ifindex, sa, data);
+		if (result < 0)
+			node = node->rb_left;
+		else if (result > 0)
+			node = node->rb_right;
+		else
+			return data;
+	}
+
+	return NULL;
+}
+
+static void cma_remove_id_from_tree(struct rdma_id_private *id_priv)
+{
+	struct id_table_entry *data;
+	unsigned long flags;
+
+	spin_lock_irqsave(&id_table_lock, flags);
+	if (list_empty(&id_priv->id_list_entry))
+		goto out;
+
+	data = node_from_ndev_ip(&id_table,
+				 id_priv->id.route.addr.dev_addr.bound_dev_if,
+				 cma_dst_addr(id_priv));
+	if (!data)
+		goto out;
+
+	list_del_init(&id_priv->id_list_entry);
+	if (list_empty(&data->id_list)) {
+		rb_erase(&data->rb_node, &id_table);
+		kfree(data);
+	}
+out:
+	spin_unlock_irqrestore(&id_table_lock, flags);
+}
+
 static void _cma_attach_to_dev(struct rdma_id_private *id_priv,
 			       struct cma_device *cma_dev)
 {
@@ -481,16 +612,6 @@ static void cma_release_dev(struct rdma_id_private *id_priv)
 	mutex_unlock(&lock);
 }
 
-static inline struct sockaddr *cma_src_addr(struct rdma_id_private *id_priv)
-{
-	return (struct sockaddr *) &id_priv->id.route.addr.src_addr;
-}
-
-static inline struct sockaddr *cma_dst_addr(struct rdma_id_private *id_priv)
-{
-	return (struct sockaddr *) &id_priv->id.route.addr.dst_addr;
-}
-
 static inline unsigned short cma_family(struct rdma_id_private *id_priv)
 {
 	return id_priv->id.route.addr.src_addr.ss_family;
@@ -861,6 +982,7 @@ __rdma_create_id(struct net *net, rdma_cm_event_handler event_handler,
 	refcount_set(&id_priv->refcount, 1);
 	mutex_init(&id_priv->handler_mutex);
 	INIT_LIST_HEAD(&id_priv->device_item);
+	INIT_LIST_HEAD(&id_priv->id_list_entry);
 	INIT_LIST_HEAD(&id_priv->listen_list);
 	INIT_LIST_HEAD(&id_priv->mc_list);
 	get_random_bytes(&id_priv->seq_num, sizeof id_priv->seq_num);
@@ -1883,6 +2005,7 @@ static void _destroy_id(struct rdma_id_private *id_priv,
 	cma_cancel_operation(id_priv, state);
 
 	rdma_restrack_del(&id_priv->res);
+	cma_remove_id_from_tree(id_priv);
 	if (id_priv->cma_dev) {
 		if (rdma_cap_ib_cm(id_priv->id.device, 1)) {
 			if (id_priv->cm_id.ib)
@@ -3172,8 +3295,11 @@ int rdma_resolve_route(struct rdma_cm_id *id, unsigned long timeout_ms)
 	cma_id_get(id_priv);
 	if (rdma_cap_ib_sa(id->device, id->port_num))
 		ret = cma_resolve_ib_route(id_priv, timeout_ms);
-	else if (rdma_protocol_roce(id->device, id->port_num))
+	else if (rdma_protocol_roce(id->device, id->port_num)) {
 		ret = cma_resolve_iboe_route(id_priv);
+		if (!ret)
+			cma_add_id_to_tree(id_priv);
+	}
 	else if (rdma_protocol_iwarp(id->device, id->port_num))
 		ret = cma_resolve_iw_route(id_priv);
 	else
@@ -4922,10 +5048,87 @@ out:
 	return ret;
 }
 
+static void cma_netevent_work_handler(struct work_struct *_work)
+{
+	struct rdma_id_private *id_priv =
+		container_of(_work, struct rdma_id_private, id.net_work);
+	struct rdma_cm_event event = {};
+
+	mutex_lock(&id_priv->handler_mutex);
+
+	if (READ_ONCE(id_priv->state) == RDMA_CM_DESTROYING ||
+	    READ_ONCE(id_priv->state) == RDMA_CM_DEVICE_REMOVAL)
+		goto out_unlock;
+
+	event.event = RDMA_CM_EVENT_UNREACHABLE;
+	event.status = -ETIMEDOUT;
+
+	if (cma_cm_event_handler(id_priv, &event)) {
+		__acquire(&id_priv->handler_mutex);
+		id_priv->cm_id.ib = NULL;
+		cma_id_put(id_priv);
+		destroy_id_handler_unlock(id_priv);
+		return;
+	}
+
+out_unlock:
+	mutex_unlock(&id_priv->handler_mutex);
+	cma_id_put(id_priv);
+}
+
+static int cma_netevent_callback(struct notifier_block *self,
+				 unsigned long event, void *ctx)
+{
+	struct id_table_entry *ips_node = NULL;
+	struct rdma_id_private *current_id;
+	struct neighbour *neigh = ctx;
+	unsigned long flags;
+
+	if (event != NETEVENT_NEIGH_UPDATE)
+		return NOTIFY_DONE;
+
+	spin_lock_irqsave(&id_table_lock, flags);
+	if (neigh->tbl->family == AF_INET6) {
+		struct sockaddr_in6 neigh_sock_6;
+
+		neigh_sock_6.sin6_family = AF_INET6;
+		neigh_sock_6.sin6_addr = *(struct in6_addr *)neigh->primary_key;
+		ips_node = node_from_ndev_ip(&id_table, neigh->dev->ifindex,
+					     (struct sockaddr *)&neigh_sock_6);
+	} else if (neigh->tbl->family == AF_INET) {
+		struct sockaddr_in neigh_sock_4;
+
+		neigh_sock_4.sin_family = AF_INET;
+		neigh_sock_4.sin_addr.s_addr = *(__be32 *)(neigh->primary_key);
+		ips_node = node_from_ndev_ip(&id_table, neigh->dev->ifindex,
+					     (struct sockaddr *)&neigh_sock_4);
+	} else
+		goto out;
+
+	if (!ips_node)
+		goto out;
+
+	list_for_each_entry(current_id, &ips_node->id_list, id_list_entry) {
+		if (!memcmp(current_id->id.route.addr.dev_addr.dst_dev_addr,
+			   neigh->ha, ETH_ALEN))
+			continue;
+		INIT_WORK(&current_id->id.net_work, cma_netevent_work_handler);
+		cma_id_get(current_id);
+		queue_work(cma_wq, &current_id->id.net_work);
+	}
+out:
+	spin_unlock_irqrestore(&id_table_lock, flags);
+	return NOTIFY_DONE;
+}
+
 static struct notifier_block cma_nb = {
 	.notifier_call = cma_netdev_callback
 };
 
+static struct notifier_block cma_netevent_cb = {
+	.notifier_call = cma_netevent_callback
+};
+
 static void cma_send_device_removal_put(struct rdma_id_private *id_priv)
 {
 	struct rdma_cm_event event = { .event = RDMA_CM_EVENT_DEVICE_REMOVAL };
@@ -5148,6 +5351,7 @@ static int __init cma_init(void)
 
 	ib_sa_register_client(&sa_client);
 	register_netdevice_notifier(&cma_nb);
+	register_netevent_notifier(&cma_netevent_cb);
 
 	ret = ib_register_client(&cma_client);
 	if (ret)
@@ -5162,6 +5366,7 @@ static int __init cma_init(void)
 err_ib:
 	ib_unregister_client(&cma_client);
 err:
+	unregister_netevent_notifier(&cma_netevent_cb);
 	unregister_netdevice_notifier(&cma_nb);
 	ib_sa_unregister_client(&sa_client);
 	unregister_pernet_subsys(&cma_pernet_operations);
@@ -5174,6 +5379,7 @@ static void __exit cma_cleanup(void)
 {
 	cma_configfs_exit();
 	ib_unregister_client(&cma_client);
+	unregister_netevent_notifier(&cma_netevent_cb);
 	unregister_netdevice_notifier(&cma_nb);
 	ib_sa_unregister_client(&sa_client);
 	unregister_pernet_subsys(&cma_pernet_operations);
diff --git a/drivers/infiniband/core/cma_priv.h b/drivers/infiniband/core/cma_priv.h
index 757a0ef79872..b7354c94cf1b 100644
--- a/drivers/infiniband/core/cma_priv.h
+++ b/drivers/infiniband/core/cma_priv.h
@@ -64,6 +64,7 @@ struct rdma_id_private {
 		struct list_head listen_item;
 		struct list_head listen_list;
 	};
+	struct list_head        id_list_entry;
 	struct cma_device	*cma_dev;
 	struct list_head	mc_list;
 
diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c
index 94d83b665a2f..29b1ab1d5f93 100644
--- a/drivers/infiniband/core/rdma_core.c
+++ b/drivers/infiniband/core/rdma_core.c
@@ -68,7 +68,7 @@ static int uverbs_try_lock_object(struct ib_uobject *uobj,
 	 * In exclusive access mode, we check that the counter is zero (nobody
 	 * claimed this object) and we set it to -1. Releasing a shared access
 	 * lock is done simply by decreasing the counter. As for exclusive
-	 * access locks, since only a single one of them is is allowed
+	 * access locks, since only a single one of them is allowed
 	 * concurrently, setting the counter to zero is enough for releasing
 	 * this lock.
 	 */
diff --git a/drivers/infiniband/core/roce_gid_mgmt.c b/drivers/infiniband/core/roce_gid_mgmt.c
index 68197e576433..e958c43dd28f 100644
--- a/drivers/infiniband/core/roce_gid_mgmt.c
+++ b/drivers/infiniband/core/roce_gid_mgmt.c
@@ -250,7 +250,7 @@ static bool upper_device_filter(struct ib_device *ib_dev, u32 port,
 
 /**
  * is_upper_ndev_bond_master_filter - Check if a given netdevice
- * is bond master device of netdevice of the the RDMA device of port.
+ * is bond master device of netdevice of the RDMA device of port.
  * @ib_dev:		IB device to check
  * @port:		Port to consider for adding default GID
  * @rdma_ndev:		Pointer to rdma netdevice
diff --git a/drivers/infiniband/hw/Makefile b/drivers/infiniband/hw/Makefile
index fba0b3be903e..6b3a88046125 100644
--- a/drivers/infiniband/hw/Makefile
+++ b/drivers/infiniband/hw/Makefile
@@ -13,3 +13,4 @@ obj-$(CONFIG_INFINIBAND_HFI1)		+= hfi1/
 obj-$(CONFIG_INFINIBAND_HNS)		+= hns/
 obj-$(CONFIG_INFINIBAND_QEDR)		+= qedr/
 obj-$(CONFIG_INFINIBAND_BNXT_RE)	+= bnxt_re/
+obj-$(CONFIG_INFINIBAND_ERDMA)		+= erdma/
diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h
index 79401e6c6aa9..785c37cae3c0 100644
--- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h
+++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h
@@ -173,7 +173,7 @@ struct bnxt_re_dev {
 	/* Max of 2 lossless traffic class supported per port */
 	u16				cosq[2];
 
-	/* QP for for handling QP1 packets */
+	/* QP for handling QP1 packets */
 	struct bnxt_re_gsi_context	gsi_ctx;
 	struct bnxt_re_stats		stats;
 	atomic_t nq_alloc_cnt;
diff --git a/drivers/infiniband/hw/erdma/Kconfig b/drivers/infiniband/hw/erdma/Kconfig
new file mode 100644
index 000000000000..169038e3ceb1
--- /dev/null
+++ b/drivers/infiniband/hw/erdma/Kconfig
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: GPL-2.0-only
+config INFINIBAND_ERDMA
+	tristate "Alibaba Elastic RDMA Adapter (ERDMA) support"
+	depends on PCI_MSI && 64BIT
+	depends on INFINIBAND_ADDR_TRANS
+	depends on INFINIBAND_USER_ACCESS
+	help
+	  This is a RDMA/iWarp driver for Alibaba Elastic RDMA Adapter(ERDMA),
+	  which supports RDMA features in Alibaba cloud environment.
+
+	  To compile this driver as module, choose M here. The module will be
+	  called erdma.
diff --git a/drivers/infiniband/hw/erdma/Makefile b/drivers/infiniband/hw/erdma/Makefile
new file mode 100644
index 000000000000..51d2ef91905a
--- /dev/null
+++ b/drivers/infiniband/hw/erdma/Makefile
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_INFINIBAND_ERDMA) := erdma.o
+
+erdma-y := erdma_cm.o erdma_main.o erdma_cmdq.o erdma_cq.o erdma_verbs.o erdma_qp.o erdma_eq.o
diff --git a/drivers/infiniband/hw/erdma/erdma.h b/drivers/infiniband/hw/erdma/erdma.h
new file mode 100644
index 000000000000..2aae635c1c8d
--- /dev/null
+++ b/drivers/infiniband/hw/erdma/erdma.h
@@ -0,0 +1,287 @@
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
+
+/* Authors: Cheng Xu <chengyou@linux.alibaba.com> */
+/*          Kai Shen <kaishen@linux.alibaba.com> */
+/* Copyright (c) 2020-2022, Alibaba Group. */
+
+#ifndef __ERDMA_H__
+#define __ERDMA_H__
+
+#include <linux/bitfield.h>
+#include <linux/netdevice.h>
+#include <linux/xarray.h>
+#include <rdma/ib_verbs.h>
+
+#include "erdma_hw.h"
+
+#define DRV_MODULE_NAME "erdma"
+#define ERDMA_NODE_DESC "Elastic RDMA(iWARP) stack"
+
+struct erdma_eq {
+	void *qbuf;
+	dma_addr_t qbuf_dma_addr;
+
+	spinlock_t lock;
+
+	u32 depth;
+
+	u16 ci;
+	u16 rsvd;
+
+	atomic64_t event_num;
+	atomic64_t notify_num;
+
+	u64 __iomem *db_addr;
+	u64 *db_record;
+};
+
+struct erdma_cmdq_sq {
+	void *qbuf;
+	dma_addr_t qbuf_dma_addr;
+
+	spinlock_t lock;
+
+	u32 depth;
+	u16 ci;
+	u16 pi;
+
+	u16 wqebb_cnt;
+
+	u64 *db_record;
+};
+
+struct erdma_cmdq_cq {
+	void *qbuf;
+	dma_addr_t qbuf_dma_addr;
+
+	spinlock_t lock;
+
+	u32 depth;
+	u32 ci;
+	u32 cmdsn;
+
+	u64 *db_record;
+
+	atomic64_t armed_num;
+};
+
+enum {
+	ERDMA_CMD_STATUS_INIT,
+	ERDMA_CMD_STATUS_ISSUED,
+	ERDMA_CMD_STATUS_FINISHED,
+	ERDMA_CMD_STATUS_TIMEOUT
+};
+
+struct erdma_comp_wait {
+	struct completion wait_event;
+	u32 cmd_status;
+	u32 ctx_id;
+	u16 sq_pi;
+	u8 comp_status;
+	u8 rsvd;
+	u32 comp_data[4];
+};
+
+enum {
+	ERDMA_CMDQ_STATE_OK_BIT = 0,
+	ERDMA_CMDQ_STATE_TIMEOUT_BIT = 1,
+	ERDMA_CMDQ_STATE_CTX_ERR_BIT = 2,
+};
+
+#define ERDMA_CMDQ_TIMEOUT_MS 15000
+#define ERDMA_REG_ACCESS_WAIT_MS 20
+#define ERDMA_WAIT_DEV_DONE_CNT 500
+
+struct erdma_cmdq {
+	unsigned long *comp_wait_bitmap;
+	struct erdma_comp_wait *wait_pool;
+	spinlock_t lock;
+
+	bool use_event;
+
+	struct erdma_cmdq_sq sq;
+	struct erdma_cmdq_cq cq;
+	struct erdma_eq eq;
+
+	unsigned long state;
+
+	struct semaphore credits;
+	u16 max_outstandings;
+};
+
+#define COMPROMISE_CC ERDMA_CC_CUBIC
+enum erdma_cc_alg {
+	ERDMA_CC_NEWRENO = 0,
+	ERDMA_CC_CUBIC,
+	ERDMA_CC_HPCC_RTT,
+	ERDMA_CC_HPCC_ECN,
+	ERDMA_CC_HPCC_INT,
+	ERDMA_CC_METHODS_NUM
+};
+
+struct erdma_devattr {
+	u32 fw_version;
+
+	unsigned char peer_addr[ETH_ALEN];
+
+	int numa_node;
+	enum erdma_cc_alg cc;
+	u32 grp_num;
+	u32 irq_num;
+
+	bool disable_dwqe;
+	u16 dwqe_pages;
+	u16 dwqe_entries;
+
+	u32 max_qp;
+	u32 max_send_wr;
+	u32 max_recv_wr;
+	u32 max_ord;
+	u32 max_ird;
+
+	u32 max_send_sge;
+	u32 max_recv_sge;
+	u32 max_sge_rd;
+	u32 max_cq;
+	u32 max_cqe;
+	u64 max_mr_size;
+	u32 max_mr;
+	u32 max_pd;
+	u32 max_mw;
+	u32 local_dma_key;
+};
+
+#define ERDMA_IRQNAME_SIZE 50
+
+struct erdma_irq {
+	char name[ERDMA_IRQNAME_SIZE];
+	u32 msix_vector;
+	cpumask_t affinity_hint_mask;
+};
+
+struct erdma_eq_cb {
+	bool ready;
+	void *dev; /* All EQs use this fields to get erdma_dev struct */
+	struct erdma_irq irq;
+	struct erdma_eq eq;
+	struct tasklet_struct tasklet;
+};
+
+struct erdma_resource_cb {
+	unsigned long *bitmap;
+	spinlock_t lock;
+	u32 next_alloc_idx;
+	u32 max_cap;
+};
+
+enum {
+	ERDMA_RES_TYPE_PD = 0,
+	ERDMA_RES_TYPE_STAG_IDX = 1,
+	ERDMA_RES_CNT = 2,
+};
+
+#define ERDMA_EXTRA_BUFFER_SIZE ERDMA_DB_SIZE
+#define WARPPED_BUFSIZE(size) ((size) + ERDMA_EXTRA_BUFFER_SIZE)
+
+struct erdma_dev {
+	struct ib_device ibdev;
+	struct net_device *netdev;
+	struct pci_dev *pdev;
+	struct notifier_block netdev_nb;
+
+	resource_size_t func_bar_addr;
+	resource_size_t func_bar_len;
+	u8 __iomem *func_bar;
+
+	struct erdma_devattr attrs;
+	/* physical port state (only one port per device) */
+	enum ib_port_state state;
+
+	/* cmdq and aeq use the same msix vector */
+	struct erdma_irq comm_irq;
+	struct erdma_cmdq cmdq;
+	struct erdma_eq aeq;
+	struct erdma_eq_cb ceqs[ERDMA_NUM_MSIX_VEC - 1];
+
+	spinlock_t lock;
+	struct erdma_resource_cb res_cb[ERDMA_RES_CNT];
+	struct xarray qp_xa;
+	struct xarray cq_xa;
+
+	u32 next_alloc_qpn;
+	u32 next_alloc_cqn;
+
+	spinlock_t db_bitmap_lock;
+	/* We provide max 64 uContexts that each has one SQ doorbell Page. */
+	DECLARE_BITMAP(sdb_page, ERDMA_DWQE_TYPE0_CNT);
+	/*
+	 * We provide max 496 uContexts that each has one SQ normal Db,
+	 * and one directWQE db。
+	 */
+	DECLARE_BITMAP(sdb_entry, ERDMA_DWQE_TYPE1_CNT);
+
+	atomic_t num_ctx;
+	struct list_head cep_list;
+};
+
+static inline void *get_queue_entry(void *qbuf, u32 idx, u32 depth, u32 shift)
+{
+	idx &= (depth - 1);
+
+	return qbuf + (idx << shift);
+}
+
+static inline struct erdma_dev *to_edev(struct ib_device *ibdev)
+{
+	return container_of(ibdev, struct erdma_dev, ibdev);
+}
+
+static inline u32 erdma_reg_read32(struct erdma_dev *dev, u32 reg)
+{
+	return readl(dev->func_bar + reg);
+}
+
+static inline u64 erdma_reg_read64(struct erdma_dev *dev, u32 reg)
+{
+	return readq(dev->func_bar + reg);
+}
+
+static inline void erdma_reg_write32(struct erdma_dev *dev, u32 reg, u32 value)
+{
+	writel(value, dev->func_bar + reg);
+}
+
+static inline void erdma_reg_write64(struct erdma_dev *dev, u32 reg, u64 value)
+{
+	writeq(value, dev->func_bar + reg);
+}
+
+static inline u32 erdma_reg_read32_filed(struct erdma_dev *dev, u32 reg,
+					 u32 filed_mask)
+{
+	u32 val = erdma_reg_read32(dev, reg);
+
+	return FIELD_GET(filed_mask, val);
+}
+
+int erdma_cmdq_init(struct erdma_dev *dev);
+void erdma_finish_cmdq_init(struct erdma_dev *dev);
+void erdma_cmdq_destroy(struct erdma_dev *dev);
+
+void erdma_cmdq_build_reqhdr(u64 *hdr, u32 mod, u32 op);
+int erdma_post_cmd_wait(struct erdma_cmdq *cmdq, u64 *req, u32 req_size,
+			u64 *resp0, u64 *resp1);
+void erdma_cmdq_completion_handler(struct erdma_cmdq *cmdq);
+
+int erdma_ceqs_init(struct erdma_dev *dev);
+void erdma_ceqs_uninit(struct erdma_dev *dev);
+void notify_eq(struct erdma_eq *eq);
+void *get_next_valid_eqe(struct erdma_eq *eq);
+
+int erdma_aeq_init(struct erdma_dev *dev);
+void erdma_aeq_destroy(struct erdma_dev *dev);
+
+void erdma_aeq_event_handler(struct erdma_dev *dev);
+void erdma_ceq_completion_handler(struct erdma_eq_cb *ceq_cb);
+
+#endif
diff --git a/drivers/infiniband/hw/erdma/erdma_cm.c b/drivers/infiniband/hw/erdma/erdma_cm.c
new file mode 100644
index 000000000000..f13f16479eca
--- /dev/null
+++ b/drivers/infiniband/hw/erdma/erdma_cm.c
@@ -0,0 +1,1430 @@
+// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
+
+/* Authors: Cheng Xu <chengyou@linux.alibaba.com> */
+/*          Kai Shen <kaishen@linux.alibaba.com> */
+/* Copyright (c) 2020-2022, Alibaba Group. */
+
+/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
+/*          Fredy Neeser */
+/*          Greg Joyce <greg@opengridcomputing.com> */
+/* Copyright (c) 2008-2019, IBM Corporation */
+/* Copyright (c) 2017, Open Grid Computing, Inc. */
+
+#include <linux/errno.h>
+#include <linux/inetdevice.h>
+#include <linux/net.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+#include <net/addrconf.h>
+
+#include <rdma/ib_user_verbs.h>
+#include <rdma/ib_verbs.h>
+
+#include "erdma.h"
+#include "erdma_cm.h"
+#include "erdma_verbs.h"
+
+static struct workqueue_struct *erdma_cm_wq;
+
+static void erdma_cm_llp_state_change(struct sock *sk);
+static void erdma_cm_llp_data_ready(struct sock *sk);
+static void erdma_cm_llp_error_report(struct sock *sk);
+
+static void erdma_sk_assign_cm_upcalls(struct sock *sk)
+{
+	write_lock_bh(&sk->sk_callback_lock);
+	sk->sk_state_change = erdma_cm_llp_state_change;
+	sk->sk_data_ready = erdma_cm_llp_data_ready;
+	sk->sk_error_report = erdma_cm_llp_error_report;
+	write_unlock_bh(&sk->sk_callback_lock);
+}
+
+static void erdma_sk_save_upcalls(struct sock *sk)
+{
+	struct erdma_cep *cep = sk_to_cep(sk);
+
+	write_lock_bh(&sk->sk_callback_lock);
+	cep->sk_state_change = sk->sk_state_change;
+	cep->sk_data_ready = sk->sk_data_ready;
+	cep->sk_error_report = sk->sk_error_report;
+	write_unlock_bh(&sk->sk_callback_lock);
+}
+
+static void erdma_sk_restore_upcalls(struct sock *sk, struct erdma_cep *cep)
+{
+	sk->sk_state_change = cep->sk_state_change;
+	sk->sk_data_ready = cep->sk_data_ready;
+	sk->sk_error_report = cep->sk_error_report;
+	sk->sk_user_data = NULL;
+}
+
+static void erdma_socket_disassoc(struct socket *s)
+{
+	struct sock *sk = s->sk;
+	struct erdma_cep *cep;
+
+	if (sk) {
+		write_lock_bh(&sk->sk_callback_lock);
+		cep = sk_to_cep(sk);
+		if (cep) {
+			erdma_sk_restore_upcalls(sk, cep);
+			erdma_cep_put(cep);
+		} else {
+			WARN_ON_ONCE(1);
+		}
+		write_unlock_bh(&sk->sk_callback_lock);
+	} else {
+		WARN_ON_ONCE(1);
+	}
+}
+
+static void erdma_cep_socket_assoc(struct erdma_cep *cep, struct socket *s)
+{
+	cep->sock = s;
+	erdma_cep_get(cep);
+	s->sk->sk_user_data = cep;
+
+	erdma_sk_save_upcalls(s->sk);
+	erdma_sk_assign_cm_upcalls(s->sk);
+}
+
+static void erdma_disassoc_listen_cep(struct erdma_cep *cep)
+{
+	if (cep->listen_cep) {
+		erdma_cep_put(cep->listen_cep);
+		cep->listen_cep = NULL;
+	}
+}
+
+static struct erdma_cep *erdma_cep_alloc(struct erdma_dev *dev)
+{
+	struct erdma_cep *cep = kzalloc(sizeof(*cep), GFP_KERNEL);
+	unsigned long flags;
+
+	if (!cep)
+		return NULL;
+
+	INIT_LIST_HEAD(&cep->listenq);
+	INIT_LIST_HEAD(&cep->devq);
+	INIT_LIST_HEAD(&cep->work_freelist);
+
+	kref_init(&cep->ref);
+	cep->state = ERDMA_EPSTATE_IDLE;
+	init_waitqueue_head(&cep->waitq);
+	spin_lock_init(&cep->lock);
+	cep->dev = dev;
+
+	spin_lock_irqsave(&dev->lock, flags);
+	list_add_tail(&cep->devq, &dev->cep_list);
+	spin_unlock_irqrestore(&dev->lock, flags);
+
+	return cep;
+}
+
+static void erdma_cm_free_work(struct erdma_cep *cep)
+{
+	struct list_head *w, *tmp;
+	struct erdma_cm_work *work;
+
+	list_for_each_safe(w, tmp, &cep->work_freelist) {
+		work = list_entry(w, struct erdma_cm_work, list);
+		list_del(&work->list);
+		kfree(work);
+	}
+}
+
+static void erdma_cancel_mpatimer(struct erdma_cep *cep)
+{
+	spin_lock_bh(&cep->lock);
+	if (cep->mpa_timer) {
+		if (cancel_delayed_work(&cep->mpa_timer->work)) {
+			erdma_cep_put(cep);
+			kfree(cep->mpa_timer);
+		}
+		cep->mpa_timer = NULL;
+	}
+	spin_unlock_bh(&cep->lock);
+}
+
+static void erdma_put_work(struct erdma_cm_work *work)
+{
+	INIT_LIST_HEAD(&work->list);
+	spin_lock_bh(&work->cep->lock);
+	list_add(&work->list, &work->cep->work_freelist);
+	spin_unlock_bh(&work->cep->lock);
+}
+
+static void erdma_cep_set_inuse(struct erdma_cep *cep)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&cep->lock, flags);
+	while (cep->in_use) {
+		spin_unlock_irqrestore(&cep->lock, flags);
+		wait_event_interruptible(cep->waitq, !cep->in_use);
+		if (signal_pending(current))
+			flush_signals(current);
+
+		spin_lock_irqsave(&cep->lock, flags);
+	}
+
+	cep->in_use = 1;
+	spin_unlock_irqrestore(&cep->lock, flags);
+}
+
+static void erdma_cep_set_free(struct erdma_cep *cep)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&cep->lock, flags);
+	cep->in_use = 0;
+	spin_unlock_irqrestore(&cep->lock, flags);
+
+	wake_up(&cep->waitq);
+}
+
+static void __erdma_cep_dealloc(struct kref *ref)
+{
+	struct erdma_cep *cep = container_of(ref, struct erdma_cep, ref);
+	struct erdma_dev *dev = cep->dev;
+	unsigned long flags;
+
+	WARN_ON(cep->listen_cep);
+
+	kfree(cep->private_data);
+	kfree(cep->mpa.pdata);
+	spin_lock_bh(&cep->lock);
+	if (!list_empty(&cep->work_freelist))
+		erdma_cm_free_work(cep);
+	spin_unlock_bh(&cep->lock);
+
+	spin_lock_irqsave(&dev->lock, flags);
+	list_del(&cep->devq);
+	spin_unlock_irqrestore(&dev->lock, flags);
+	kfree(cep);
+}
+
+static struct erdma_cm_work *erdma_get_work(struct erdma_cep *cep)
+{
+	struct erdma_cm_work *work = NULL;
+
+	spin_lock_bh(&cep->lock);
+	if (!list_empty(&cep->work_freelist)) {
+		work = list_entry(cep->work_freelist.next, struct erdma_cm_work,
+				  list);
+		list_del_init(&work->list);
+	}
+
+	spin_unlock_bh(&cep->lock);
+	return work;
+}
+
+static int erdma_cm_alloc_work(struct erdma_cep *cep, int num)
+{
+	struct erdma_cm_work *work;
+
+	while (num--) {
+		work = kmalloc(sizeof(*work), GFP_KERNEL);
+		if (!work) {
+			if (!(list_empty(&cep->work_freelist)))
+				erdma_cm_free_work(cep);
+			return -ENOMEM;
+		}
+		work->cep = cep;
+		INIT_LIST_HEAD(&work->list);
+		list_add(&work->list, &cep->work_freelist);
+	}
+
+	return 0;
+}
+
+static int erdma_cm_upcall(struct erdma_cep *cep, enum iw_cm_event_type reason,
+			   int status)
+{
+	struct iw_cm_event event;
+	struct iw_cm_id *cm_id;
+
+	memset(&event, 0, sizeof(event));
+	event.status = status;
+	event.event = reason;
+
+	if (reason == IW_CM_EVENT_CONNECT_REQUEST) {
+		event.provider_data = cep;
+		cm_id = cep->listen_cep->cm_id;
+
+		event.ird = cep->dev->attrs.max_ird;
+		event.ord = cep->dev->attrs.max_ord;
+	} else {
+		cm_id = cep->cm_id;
+	}
+
+	if (reason == IW_CM_EVENT_CONNECT_REQUEST ||
+	    reason == IW_CM_EVENT_CONNECT_REPLY) {
+		u16 pd_len = be16_to_cpu(cep->mpa.hdr.params.pd_len);
+
+		if (pd_len && cep->mpa.pdata) {
+			event.private_data_len = pd_len;
+			event.private_data = cep->mpa.pdata;
+		}
+
+		getname_local(cep->sock, &event.local_addr);
+		getname_peer(cep->sock, &event.remote_addr);
+	}
+
+	return cm_id->event_handler(cm_id, &event);
+}
+
+void erdma_qp_cm_drop(struct erdma_qp *qp)
+{
+	struct erdma_cep *cep = qp->cep;
+
+	if (!qp->cep)
+		return;
+
+	erdma_cep_set_inuse(cep);
+
+	/* already closed. */
+	if (cep->state == ERDMA_EPSTATE_CLOSED)
+		goto out;
+
+	if (cep->cm_id) {
+		switch (cep->state) {
+		case ERDMA_EPSTATE_AWAIT_MPAREP:
+			erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY,
+					-EINVAL);
+			break;
+		case ERDMA_EPSTATE_RDMA_MODE:
+			erdma_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0);
+			break;
+		case ERDMA_EPSTATE_IDLE:
+		case ERDMA_EPSTATE_LISTENING:
+		case ERDMA_EPSTATE_CONNECTING:
+		case ERDMA_EPSTATE_AWAIT_MPAREQ:
+		case ERDMA_EPSTATE_RECVD_MPAREQ:
+		case ERDMA_EPSTATE_CLOSED:
+		default:
+			break;
+		}
+		cep->cm_id->rem_ref(cep->cm_id);
+		cep->cm_id = NULL;
+		erdma_cep_put(cep);
+	}
+	cep->state = ERDMA_EPSTATE_CLOSED;
+
+	if (cep->sock) {
+		erdma_socket_disassoc(cep->sock);
+		sock_release(cep->sock);
+		cep->sock = NULL;
+	}
+
+	if (cep->qp) {
+		cep->qp = NULL;
+		erdma_qp_put(qp);
+	}
+out:
+	erdma_cep_set_free(cep);
+}
+
+void erdma_cep_put(struct erdma_cep *cep)
+{
+	WARN_ON(kref_read(&cep->ref) < 1);
+	kref_put(&cep->ref, __erdma_cep_dealloc);
+}
+
+void erdma_cep_get(struct erdma_cep *cep)
+{
+	kref_get(&cep->ref);
+}
+
+static int erdma_send_mpareqrep(struct erdma_cep *cep, const void *pdata,
+				u8 pd_len)
+{
+	struct socket *s = cep->sock;
+	struct mpa_rr *rr = &cep->mpa.hdr;
+	struct kvec iov[3];
+	struct msghdr msg;
+	int iovec_num = 0;
+	int ret;
+	int mpa_len;
+
+	memset(&msg, 0, sizeof(msg));
+
+	rr->params.pd_len = cpu_to_be16(pd_len);
+
+	iov[iovec_num].iov_base = rr;
+	iov[iovec_num].iov_len = sizeof(*rr);
+	iovec_num++;
+	mpa_len = sizeof(*rr);
+
+	iov[iovec_num].iov_base = &cep->mpa.ext_data;
+	iov[iovec_num].iov_len = sizeof(cep->mpa.ext_data);
+	iovec_num++;
+	mpa_len += sizeof(cep->mpa.ext_data);
+
+	if (pd_len) {
+		iov[iovec_num].iov_base = (char *)pdata;
+		iov[iovec_num].iov_len = pd_len;
+		mpa_len += pd_len;
+		iovec_num++;
+	}
+
+	ret = kernel_sendmsg(s, &msg, iov, iovec_num, mpa_len);
+
+	return ret < 0 ? ret : 0;
+}
+
+static inline int ksock_recv(struct socket *sock, char *buf, size_t size,
+			     int flags)
+{
+	struct kvec iov = { buf, size };
+	struct msghdr msg = { .msg_name = NULL, .msg_flags = flags };
+
+	return kernel_recvmsg(sock, &msg, &iov, 1, size, flags);
+}
+
+static int __recv_mpa_hdr(struct erdma_cep *cep, int hdr_rcvd, char *hdr,
+			  int hdr_size, int *rcvd_out)
+{
+	struct socket *s = cep->sock;
+	int rcvd;
+
+	*rcvd_out = 0;
+	if (hdr_rcvd < hdr_size) {
+		rcvd = ksock_recv(s, hdr + hdr_rcvd, hdr_size - hdr_rcvd,
+				  MSG_DONTWAIT);
+		if (rcvd == -EAGAIN)
+			return -EAGAIN;
+
+		if (rcvd <= 0)
+			return -ECONNABORTED;
+
+		hdr_rcvd += rcvd;
+		*rcvd_out = rcvd;
+
+		if (hdr_rcvd < hdr_size)
+			return -EAGAIN;
+	}
+
+	return 0;
+}
+
+static void __mpa_rr_set_revision(__be16 *bits, u8 rev)
+{
+	*bits = (*bits & ~MPA_RR_MASK_REVISION) |
+		(cpu_to_be16(rev) & MPA_RR_MASK_REVISION);
+}
+
+static u8 __mpa_rr_revision(__be16 mpa_rr_bits)
+{
+	__be16 rev = mpa_rr_bits & MPA_RR_MASK_REVISION;
+
+	return (u8)be16_to_cpu(rev);
+}
+
+static void __mpa_ext_set_cc(__be32 *bits, u32 cc)
+{
+	*bits = (*bits & ~MPA_EXT_FLAG_CC) |
+		(cpu_to_be32(cc) & MPA_EXT_FLAG_CC);
+}
+
+static u8 __mpa_ext_cc(__be32 mpa_ext_bits)
+{
+	__be32 cc = mpa_ext_bits & MPA_EXT_FLAG_CC;
+
+	return (u8)be32_to_cpu(cc);
+}
+
+/*
+ * Receive MPA Request/Reply header.
+ *
+ * Returns 0 if complete MPA Request/Reply haeder including
+ * eventual private data was received. Returns -EAGAIN if
+ * header was partially received or negative error code otherwise.
+ *
+ * Context: May be called in process context only
+ */
+static int erdma_recv_mpa_rr(struct erdma_cep *cep)
+{
+	struct mpa_rr *hdr = &cep->mpa.hdr;
+	struct socket *s = cep->sock;
+	u16 pd_len;
+	int rcvd, to_rcv, ret, pd_rcvd;
+
+	if (cep->mpa.bytes_rcvd < sizeof(struct mpa_rr)) {
+		ret = __recv_mpa_hdr(cep, cep->mpa.bytes_rcvd,
+				     (char *)&cep->mpa.hdr,
+				     sizeof(struct mpa_rr), &rcvd);
+		cep->mpa.bytes_rcvd += rcvd;
+		if (ret)
+			return ret;
+	}
+
+	if (be16_to_cpu(hdr->params.pd_len) > MPA_MAX_PRIVDATA ||
+	    __mpa_rr_revision(hdr->params.bits) != MPA_REVISION_EXT_1)
+		return -EPROTO;
+
+	if (cep->mpa.bytes_rcvd - sizeof(struct mpa_rr) <
+	    sizeof(struct erdma_mpa_ext)) {
+		ret = __recv_mpa_hdr(
+			cep, cep->mpa.bytes_rcvd - sizeof(struct mpa_rr),
+			(char *)&cep->mpa.ext_data,
+			sizeof(struct erdma_mpa_ext), &rcvd);
+		cep->mpa.bytes_rcvd += rcvd;
+		if (ret)
+			return ret;
+	}
+
+	pd_len = be16_to_cpu(hdr->params.pd_len);
+	pd_rcvd = cep->mpa.bytes_rcvd - sizeof(struct mpa_rr) -
+		  sizeof(struct erdma_mpa_ext);
+	to_rcv = pd_len - pd_rcvd;
+
+	if (!to_rcv) {
+		/*
+		 * We have received the whole MPA Request/Reply message.
+		 * Check against peer protocol violation.
+		 */
+		u32 word;
+
+		ret = __recv_mpa_hdr(cep, 0, (char *)&word, sizeof(word),
+				     &rcvd);
+		if (ret == -EAGAIN && rcvd == 0)
+			return 0;
+
+		if (ret)
+			return ret;
+
+		return -EPROTO;
+	}
+
+	/*
+	 * At this point, MPA header has been fully received, and pd_len != 0.
+	 * So, begin to receive private data.
+	 */
+	if (!cep->mpa.pdata) {
+		cep->mpa.pdata = kmalloc(pd_len + 4, GFP_KERNEL);
+		if (!cep->mpa.pdata)
+			return -ENOMEM;
+	}
+
+	rcvd = ksock_recv(s, cep->mpa.pdata + pd_rcvd, to_rcv + 4,
+			  MSG_DONTWAIT);
+	if (rcvd < 0)
+		return rcvd;
+
+	if (rcvd > to_rcv)
+		return -EPROTO;
+
+	cep->mpa.bytes_rcvd += rcvd;
+
+	if (to_rcv == rcvd)
+		return 0;
+
+	return -EAGAIN;
+}
+
+/*
+ * erdma_proc_mpareq()
+ *
+ * Read MPA Request from socket and signal new connection to IWCM
+ * if success. Caller must hold lock on corresponding listening CEP.
+ */
+static int erdma_proc_mpareq(struct erdma_cep *cep)
+{
+	struct mpa_rr *req;
+	int ret;
+
+	ret = erdma_recv_mpa_rr(cep);
+	if (ret)
+		return ret;
+
+	req = &cep->mpa.hdr;
+
+	if (memcmp(req->key, MPA_KEY_REQ, MPA_KEY_SIZE))
+		return -EPROTO;
+
+	memcpy(req->key, MPA_KEY_REP, MPA_KEY_SIZE);
+
+	/* Currently does not support marker and crc. */
+	if (req->params.bits & MPA_RR_FLAG_MARKERS ||
+	    req->params.bits & MPA_RR_FLAG_CRC)
+		goto reject_conn;
+
+	cep->state = ERDMA_EPSTATE_RECVD_MPAREQ;
+
+	/* Keep reference until IWCM accepts/rejects */
+	erdma_cep_get(cep);
+	ret = erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REQUEST, 0);
+	if (ret)
+		erdma_cep_put(cep);
+
+	return ret;
+
+reject_conn:
+	req->params.bits &= ~MPA_RR_FLAG_MARKERS;
+	req->params.bits |= MPA_RR_FLAG_REJECT;
+	req->params.bits &= ~MPA_RR_FLAG_CRC;
+
+	kfree(cep->mpa.pdata);
+	cep->mpa.pdata = NULL;
+	erdma_send_mpareqrep(cep, NULL, 0);
+
+	return -EOPNOTSUPP;
+}
+
+static int erdma_proc_mpareply(struct erdma_cep *cep)
+{
+	struct erdma_qp_attrs qp_attrs;
+	struct erdma_qp *qp = cep->qp;
+	struct mpa_rr *rep;
+	int ret;
+
+	ret = erdma_recv_mpa_rr(cep);
+	if (ret)
+		goto out_err;
+
+	erdma_cancel_mpatimer(cep);
+
+	rep = &cep->mpa.hdr;
+
+	if (memcmp(rep->key, MPA_KEY_REP, MPA_KEY_SIZE)) {
+		ret = -EPROTO;
+		goto out_err;
+	}
+
+	if (rep->params.bits & MPA_RR_FLAG_REJECT) {
+		erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -ECONNRESET);
+		return -ECONNRESET;
+	}
+
+	/* Currently does not support marker and crc. */
+	if ((rep->params.bits & MPA_RR_FLAG_MARKERS) ||
+	    (rep->params.bits & MPA_RR_FLAG_CRC)) {
+		erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -ECONNREFUSED);
+		return -EINVAL;
+	}
+
+	memset(&qp_attrs, 0, sizeof(qp_attrs));
+	qp_attrs.irq_size = cep->ird;
+	qp_attrs.orq_size = cep->ord;
+	qp_attrs.state = ERDMA_QP_STATE_RTS;
+
+	down_write(&qp->state_lock);
+	if (qp->attrs.state > ERDMA_QP_STATE_RTR) {
+		ret = -EINVAL;
+		up_write(&qp->state_lock);
+		goto out_err;
+	}
+
+	qp->attrs.qp_type = ERDMA_QP_ACTIVE;
+	if (__mpa_ext_cc(cep->mpa.ext_data.bits) != qp->attrs.cc)
+		qp->attrs.cc = COMPROMISE_CC;
+
+	ret = erdma_modify_qp_internal(qp, &qp_attrs,
+				       ERDMA_QP_ATTR_STATE |
+				       ERDMA_QP_ATTR_LLP_HANDLE |
+				       ERDMA_QP_ATTR_MPA);
+
+	up_write(&qp->state_lock);
+
+	if (!ret) {
+		ret = erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, 0);
+		if (!ret)
+			cep->state = ERDMA_EPSTATE_RDMA_MODE;
+
+		return 0;
+	}
+
+out_err:
+	if (ret != -EAGAIN)
+		erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -EINVAL);
+
+	return ret;
+}
+
+static void erdma_accept_newconn(struct erdma_cep *cep)
+{
+	struct socket *s = cep->sock;
+	struct socket *new_s = NULL;
+	struct erdma_cep *new_cep = NULL;
+	int ret = 0;
+
+	if (cep->state != ERDMA_EPSTATE_LISTENING)
+		goto error;
+
+	new_cep = erdma_cep_alloc(cep->dev);
+	if (!new_cep)
+		goto error;
+
+	/*
+	 * 4: Allocate a sufficient number of work elements
+	 * to allow concurrent handling of local + peer close
+	 * events, MPA header processing + MPA timeout.
+	 */
+	if (erdma_cm_alloc_work(new_cep, 4) != 0)
+		goto error;
+
+	/*
+	 * Copy saved socket callbacks from listening CEP
+	 * and assign new socket with new CEP
+	 */
+	new_cep->sk_state_change = cep->sk_state_change;
+	new_cep->sk_data_ready = cep->sk_data_ready;
+	new_cep->sk_error_report = cep->sk_error_report;
+
+	ret = kernel_accept(s, &new_s, O_NONBLOCK);
+	if (ret != 0)
+		goto error;
+
+	new_cep->sock = new_s;
+	erdma_cep_get(new_cep);
+	new_s->sk->sk_user_data = new_cep;
+
+	tcp_sock_set_nodelay(new_s->sk);
+	new_cep->state = ERDMA_EPSTATE_AWAIT_MPAREQ;
+
+	ret = erdma_cm_queue_work(new_cep, ERDMA_CM_WORK_MPATIMEOUT);
+	if (ret)
+		goto error;
+
+	new_cep->listen_cep = cep;
+	erdma_cep_get(cep);
+
+	if (atomic_read(&new_s->sk->sk_rmem_alloc)) {
+		/* MPA REQ already queued */
+		erdma_cep_set_inuse(new_cep);
+		ret = erdma_proc_mpareq(new_cep);
+		if (ret != -EAGAIN) {
+			erdma_cep_put(cep);
+			new_cep->listen_cep = NULL;
+			if (ret) {
+				erdma_cep_set_free(new_cep);
+				goto error;
+			}
+		}
+		erdma_cep_set_free(new_cep);
+	}
+	return;
+
+error:
+	if (new_cep) {
+		new_cep->state = ERDMA_EPSTATE_CLOSED;
+		erdma_cancel_mpatimer(new_cep);
+
+		erdma_cep_put(new_cep);
+		new_cep->sock = NULL;
+	}
+
+	if (new_s) {
+		erdma_socket_disassoc(new_s);
+		sock_release(new_s);
+	}
+}
+
+static int erdma_newconn_connected(struct erdma_cep *cep)
+{
+	int ret = 0;
+
+	cep->mpa.hdr.params.bits = 0;
+	__mpa_rr_set_revision(&cep->mpa.hdr.params.bits, MPA_REVISION_EXT_1);
+
+	memcpy(cep->mpa.hdr.key, MPA_KEY_REQ, MPA_KEY_SIZE);
+	cep->mpa.ext_data.cookie = cpu_to_be32(cep->qp->attrs.cookie);
+	__mpa_ext_set_cc(&cep->mpa.ext_data.bits, cep->qp->attrs.cc);
+
+	ret = erdma_send_mpareqrep(cep, cep->private_data, cep->pd_len);
+	cep->state = ERDMA_EPSTATE_AWAIT_MPAREP;
+	cep->mpa.hdr.params.pd_len = 0;
+
+	if (ret >= 0)
+		ret = erdma_cm_queue_work(cep, ERDMA_CM_WORK_MPATIMEOUT);
+
+	return ret;
+}
+
+static void erdma_cm_work_handler(struct work_struct *w)
+{
+	struct erdma_cm_work *work;
+	struct erdma_cep *cep;
+	int release_cep = 0, ret = 0;
+
+	work = container_of(w, struct erdma_cm_work, work.work);
+	cep = work->cep;
+
+	erdma_cep_set_inuse(cep);
+
+	switch (work->type) {
+	case ERDMA_CM_WORK_CONNECTED:
+		erdma_cancel_mpatimer(cep);
+		if (cep->state == ERDMA_EPSTATE_CONNECTING) {
+			ret = erdma_newconn_connected(cep);
+			if (ret) {
+				erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY,
+						-EIO);
+				release_cep = 1;
+			}
+		}
+		break;
+	case ERDMA_CM_WORK_CONNECTTIMEOUT:
+		if (cep->state == ERDMA_EPSTATE_CONNECTING) {
+			cep->mpa_timer = NULL;
+			erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY,
+					-ETIMEDOUT);
+			release_cep = 1;
+		}
+		break;
+	case ERDMA_CM_WORK_ACCEPT:
+		erdma_accept_newconn(cep);
+		break;
+	case ERDMA_CM_WORK_READ_MPAHDR:
+		if (cep->state == ERDMA_EPSTATE_AWAIT_MPAREQ) {
+			if (cep->listen_cep) {
+				erdma_cep_set_inuse(cep->listen_cep);
+
+				if (cep->listen_cep->state ==
+				    ERDMA_EPSTATE_LISTENING)
+					ret = erdma_proc_mpareq(cep);
+				else
+					ret = -EFAULT;
+
+				erdma_cep_set_free(cep->listen_cep);
+
+				if (ret != -EAGAIN) {
+					erdma_cep_put(cep->listen_cep);
+					cep->listen_cep = NULL;
+					if (ret)
+						erdma_cep_put(cep);
+				}
+			}
+		} else if (cep->state == ERDMA_EPSTATE_AWAIT_MPAREP) {
+			ret = erdma_proc_mpareply(cep);
+		}
+
+		if (ret && ret != -EAGAIN)
+			release_cep = 1;
+		break;
+	case ERDMA_CM_WORK_CLOSE_LLP:
+		if (cep->cm_id)
+			erdma_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0);
+		release_cep = 1;
+		break;
+	case ERDMA_CM_WORK_PEER_CLOSE:
+		if (cep->cm_id) {
+			if (cep->state == ERDMA_EPSTATE_CONNECTING ||
+			    cep->state == ERDMA_EPSTATE_AWAIT_MPAREP) {
+				/*
+				 * MPA reply not received, but connection drop
+				 */
+				erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY,
+						-ECONNRESET);
+			} else if (cep->state == ERDMA_EPSTATE_RDMA_MODE) {
+				/*
+				 * NOTE: IW_CM_EVENT_DISCONNECT is given just
+				 *       to transition IWCM into CLOSING.
+				 */
+				erdma_cm_upcall(cep, IW_CM_EVENT_DISCONNECT, 0);
+				erdma_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0);
+			}
+		} else if (cep->state == ERDMA_EPSTATE_AWAIT_MPAREQ) {
+			/* Socket close before MPA request received. */
+			erdma_disassoc_listen_cep(cep);
+			erdma_cep_put(cep);
+		}
+		release_cep = 1;
+		break;
+	case ERDMA_CM_WORK_MPATIMEOUT:
+		cep->mpa_timer = NULL;
+		if (cep->state == ERDMA_EPSTATE_AWAIT_MPAREP) {
+			/*
+			 * MPA request timed out:
+			 * Hide any partially received private data and signal
+			 * timeout
+			 */
+			cep->mpa.hdr.params.pd_len = 0;
+
+			if (cep->cm_id)
+				erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY,
+						-ETIMEDOUT);
+			release_cep = 1;
+		} else if (cep->state == ERDMA_EPSTATE_AWAIT_MPAREQ) {
+			/* No MPA req received after peer TCP stream setup. */
+			erdma_disassoc_listen_cep(cep);
+
+			erdma_cep_put(cep);
+			release_cep = 1;
+		}
+		break;
+	default:
+		WARN(1, "Undefined CM work type: %d\n", work->type);
+	}
+
+	if (release_cep) {
+		erdma_cancel_mpatimer(cep);
+		cep->state = ERDMA_EPSTATE_CLOSED;
+		if (cep->qp) {
+			struct erdma_qp *qp = cep->qp;
+			/*
+			 * Serialize a potential race with application
+			 * closing the QP and calling erdma_qp_cm_drop()
+			 */
+			erdma_qp_get(qp);
+			erdma_cep_set_free(cep);
+
+			erdma_qp_llp_close(qp);
+			erdma_qp_put(qp);
+
+			erdma_cep_set_inuse(cep);
+			cep->qp = NULL;
+			erdma_qp_put(qp);
+		}
+
+		if (cep->sock) {
+			erdma_socket_disassoc(cep->sock);
+			sock_release(cep->sock);
+			cep->sock = NULL;
+		}
+
+		if (cep->cm_id) {
+			cep->cm_id->rem_ref(cep->cm_id);
+			cep->cm_id = NULL;
+			if (cep->state != ERDMA_EPSTATE_LISTENING)
+				erdma_cep_put(cep);
+		}
+	}
+	erdma_cep_set_free(cep);
+	erdma_put_work(work);
+	erdma_cep_put(cep);
+}
+
+int erdma_cm_queue_work(struct erdma_cep *cep, enum erdma_work_type type)
+{
+	struct erdma_cm_work *work = erdma_get_work(cep);
+	unsigned long delay = 0;
+
+	if (!work)
+		return -ENOMEM;
+
+	work->type = type;
+	work->cep = cep;
+
+	erdma_cep_get(cep);
+
+	INIT_DELAYED_WORK(&work->work, erdma_cm_work_handler);
+
+	if (type == ERDMA_CM_WORK_MPATIMEOUT) {
+		cep->mpa_timer = work;
+
+		if (cep->state == ERDMA_EPSTATE_AWAIT_MPAREP)
+			delay = MPAREP_TIMEOUT;
+		else
+			delay = MPAREQ_TIMEOUT;
+	} else if (type == ERDMA_CM_WORK_CONNECTTIMEOUT) {
+		cep->mpa_timer = work;
+
+		delay = CONNECT_TIMEOUT;
+	}
+
+	queue_delayed_work(erdma_cm_wq, &work->work, delay);
+
+	return 0;
+}
+
+static void erdma_cm_llp_data_ready(struct sock *sk)
+{
+	struct erdma_cep *cep;
+
+	read_lock(&sk->sk_callback_lock);
+
+	cep = sk_to_cep(sk);
+	if (!cep)
+		goto out;
+
+	if (cep->state == ERDMA_EPSTATE_AWAIT_MPAREQ ||
+	    cep->state == ERDMA_EPSTATE_AWAIT_MPAREP)
+		erdma_cm_queue_work(cep, ERDMA_CM_WORK_READ_MPAHDR);
+
+out:
+	read_unlock(&sk->sk_callback_lock);
+}
+
+static void erdma_cm_llp_error_report(struct sock *sk)
+{
+	struct erdma_cep *cep = sk_to_cep(sk);
+
+	if (cep)
+		cep->sk_error_report(sk);
+}
+
+static void erdma_cm_llp_state_change(struct sock *sk)
+{
+	struct erdma_cep *cep;
+	void (*orig_state_change)(struct sock *sk);
+
+	read_lock(&sk->sk_callback_lock);
+
+	cep = sk_to_cep(sk);
+	if (!cep) {
+		read_unlock(&sk->sk_callback_lock);
+		return;
+	}
+	orig_state_change = cep->sk_state_change;
+
+	switch (sk->sk_state) {
+	case TCP_ESTABLISHED:
+		if (cep->state == ERDMA_EPSTATE_CONNECTING)
+			erdma_cm_queue_work(cep, ERDMA_CM_WORK_CONNECTED);
+		else
+			erdma_cm_queue_work(cep, ERDMA_CM_WORK_ACCEPT);
+		break;
+	case TCP_CLOSE:
+	case TCP_CLOSE_WAIT:
+		if (cep->state != ERDMA_EPSTATE_LISTENING)
+			erdma_cm_queue_work(cep, ERDMA_CM_WORK_PEER_CLOSE);
+		break;
+	default:
+		break;
+	}
+	read_unlock(&sk->sk_callback_lock);
+	orig_state_change(sk);
+}
+
+static int kernel_bindconnect(struct socket *s, struct sockaddr *laddr,
+			      int laddrlen, struct sockaddr *raddr,
+			      int raddrlen, int flags)
+{
+	int ret;
+
+	sock_set_reuseaddr(s->sk);
+	ret = s->ops->bind(s, laddr, laddrlen);
+	if (ret)
+		return ret;
+	ret = s->ops->connect(s, raddr, raddrlen, flags);
+	return ret < 0 ? ret : 0;
+}
+
+int erdma_connect(struct iw_cm_id *id, struct iw_cm_conn_param *params)
+{
+	struct erdma_dev *dev = to_edev(id->device);
+	struct erdma_qp *qp;
+	struct erdma_cep *cep = NULL;
+	struct socket *s = NULL;
+	struct sockaddr *laddr = (struct sockaddr *)&id->m_local_addr;
+	struct sockaddr *raddr = (struct sockaddr *)&id->m_remote_addr;
+	u16 pd_len = params->private_data_len;
+	int ret;
+
+	if (pd_len > MPA_MAX_PRIVDATA)
+		return -EINVAL;
+
+	if (params->ird > dev->attrs.max_ird ||
+	    params->ord > dev->attrs.max_ord)
+		return -EINVAL;
+
+	if (laddr->sa_family != AF_INET || raddr->sa_family != AF_INET)
+		return -EAFNOSUPPORT;
+
+	qp = find_qp_by_qpn(dev, params->qpn);
+	if (!qp)
+		return -ENOENT;
+	erdma_qp_get(qp);
+
+	ret = sock_create(AF_INET, SOCK_STREAM, IPPROTO_TCP, &s);
+	if (ret < 0)
+		goto error_put_qp;
+
+	cep = erdma_cep_alloc(dev);
+	if (!cep) {
+		ret = -ENOMEM;
+		goto error_release_sock;
+	}
+
+	erdma_cep_set_inuse(cep);
+
+	/* Associate QP with CEP */
+	erdma_cep_get(cep);
+	qp->cep = cep;
+	cep->qp = qp;
+
+	/* Associate cm_id with CEP */
+	id->add_ref(id);
+	cep->cm_id = id;
+
+	/*
+	 * 6: Allocate a sufficient number of work elements
+	 * to allow concurrent handling of local + peer close
+	 * events, MPA header processing + MPA timeout, connected event
+	 * and connect timeout.
+	 */
+	ret = erdma_cm_alloc_work(cep, 6);
+	if (ret != 0) {
+		ret = -ENOMEM;
+		goto error_release_cep;
+	}
+
+	cep->ird = params->ird;
+	cep->ord = params->ord;
+	cep->state = ERDMA_EPSTATE_CONNECTING;
+
+	erdma_cep_socket_assoc(cep, s);
+
+	if (pd_len) {
+		cep->pd_len = pd_len;
+		cep->private_data = kmalloc(pd_len, GFP_KERNEL);
+		if (!cep->private_data) {
+			ret = -ENOMEM;
+			goto error_disassoc;
+		}
+
+		memcpy(cep->private_data, params->private_data,
+		       params->private_data_len);
+	}
+
+	ret = kernel_bindconnect(s, laddr, sizeof(*laddr), raddr,
+				 sizeof(*raddr), O_NONBLOCK);
+	if (ret != -EINPROGRESS && ret != 0) {
+		goto error_disassoc;
+	} else if (ret == 0) {
+		ret = erdma_cm_queue_work(cep, ERDMA_CM_WORK_CONNECTED);
+		if (ret)
+			goto error_disassoc;
+	} else {
+		ret = erdma_cm_queue_work(cep, ERDMA_CM_WORK_CONNECTTIMEOUT);
+		if (ret)
+			goto error_disassoc;
+	}
+
+	erdma_cep_set_free(cep);
+	return 0;
+
+error_disassoc:
+	kfree(cep->private_data);
+	cep->private_data = NULL;
+	cep->pd_len = 0;
+
+	erdma_socket_disassoc(s);
+
+error_release_cep:
+	/* disassoc with cm_id */
+	cep->cm_id = NULL;
+	id->rem_ref(id);
+
+	/* disassoc with qp */
+	qp->cep = NULL;
+	erdma_cep_put(cep);
+	cep->qp = NULL;
+
+	cep->state = ERDMA_EPSTATE_CLOSED;
+
+	erdma_cep_set_free(cep);
+
+	/* release the cep. */
+	erdma_cep_put(cep);
+
+error_release_sock:
+	if (s)
+		sock_release(s);
+error_put_qp:
+	erdma_qp_put(qp);
+
+	return ret;
+}
+
+int erdma_accept(struct iw_cm_id *id, struct iw_cm_conn_param *params)
+{
+	struct erdma_dev *dev = to_edev(id->device);
+	struct erdma_cep *cep = (struct erdma_cep *)id->provider_data;
+	struct erdma_qp *qp;
+	struct erdma_qp_attrs qp_attrs;
+	int ret;
+
+	erdma_cep_set_inuse(cep);
+	erdma_cep_put(cep);
+
+	/* Free lingering inbound private data */
+	if (cep->mpa.hdr.params.pd_len) {
+		cep->mpa.hdr.params.pd_len = 0;
+		kfree(cep->mpa.pdata);
+		cep->mpa.pdata = NULL;
+	}
+	erdma_cancel_mpatimer(cep);
+
+	if (cep->state != ERDMA_EPSTATE_RECVD_MPAREQ) {
+		erdma_cep_set_free(cep);
+		erdma_cep_put(cep);
+
+		return -ECONNRESET;
+	}
+
+	qp = find_qp_by_qpn(dev, params->qpn);
+	if (!qp)
+		return -ENOENT;
+	erdma_qp_get(qp);
+
+	down_write(&qp->state_lock);
+	if (qp->attrs.state > ERDMA_QP_STATE_RTR) {
+		ret = -EINVAL;
+		up_write(&qp->state_lock);
+		goto error;
+	}
+
+	if (params->ord > dev->attrs.max_ord ||
+	    params->ird > dev->attrs.max_ord) {
+		ret = -EINVAL;
+		up_write(&qp->state_lock);
+		goto error;
+	}
+
+	if (params->private_data_len > MPA_MAX_PRIVDATA) {
+		ret = -EINVAL;
+		up_write(&qp->state_lock);
+		goto error;
+	}
+
+	cep->ird = params->ird;
+	cep->ord = params->ord;
+
+	cep->cm_id = id;
+	id->add_ref(id);
+
+	memset(&qp_attrs, 0, sizeof(qp_attrs));
+	qp_attrs.orq_size = params->ord;
+	qp_attrs.irq_size = params->ird;
+
+	qp_attrs.state = ERDMA_QP_STATE_RTS;
+
+	/* Associate QP with CEP */
+	erdma_cep_get(cep);
+	qp->cep = cep;
+	cep->qp = qp;
+
+	cep->state = ERDMA_EPSTATE_RDMA_MODE;
+
+	qp->attrs.qp_type = ERDMA_QP_PASSIVE;
+	qp->attrs.pd_len = params->private_data_len;
+
+	if (qp->attrs.cc != __mpa_ext_cc(cep->mpa.ext_data.bits))
+		qp->attrs.cc = COMPROMISE_CC;
+
+	/* move to rts */
+	ret = erdma_modify_qp_internal(qp, &qp_attrs,
+				       ERDMA_QP_ATTR_STATE |
+				       ERDMA_QP_ATTR_ORD |
+				       ERDMA_QP_ATTR_LLP_HANDLE |
+				       ERDMA_QP_ATTR_IRD |
+				       ERDMA_QP_ATTR_MPA);
+	up_write(&qp->state_lock);
+
+	if (ret)
+		goto error;
+
+	cep->mpa.ext_data.bits = 0;
+	__mpa_ext_set_cc(&cep->mpa.ext_data.bits, qp->attrs.cc);
+	cep->mpa.ext_data.cookie = cpu_to_be32(cep->qp->attrs.cookie);
+
+	ret = erdma_send_mpareqrep(cep, params->private_data,
+				   params->private_data_len);
+	if (!ret) {
+		ret = erdma_cm_upcall(cep, IW_CM_EVENT_ESTABLISHED, 0);
+		if (ret)
+			goto error;
+
+		erdma_cep_set_free(cep);
+
+		return 0;
+	}
+
+error:
+	erdma_socket_disassoc(cep->sock);
+	sock_release(cep->sock);
+	cep->sock = NULL;
+
+	cep->state = ERDMA_EPSTATE_CLOSED;
+
+	if (cep->cm_id) {
+		cep->cm_id->rem_ref(id);
+		cep->cm_id = NULL;
+	}
+
+	if (qp->cep) {
+		erdma_cep_put(cep);
+		qp->cep = NULL;
+	}
+
+	cep->qp = NULL;
+	erdma_qp_put(qp);
+
+	erdma_cep_set_free(cep);
+	erdma_cep_put(cep);
+
+	return ret;
+}
+
+int erdma_reject(struct iw_cm_id *id, const void *pdata, u8 plen)
+{
+	struct erdma_cep *cep = (struct erdma_cep *)id->provider_data;
+
+	erdma_cep_set_inuse(cep);
+	erdma_cep_put(cep);
+
+	erdma_cancel_mpatimer(cep);
+
+	if (cep->state != ERDMA_EPSTATE_RECVD_MPAREQ) {
+		erdma_cep_set_free(cep);
+		erdma_cep_put(cep);
+
+		return -ECONNRESET;
+	}
+
+	if (__mpa_rr_revision(cep->mpa.hdr.params.bits) == MPA_REVISION_EXT_1) {
+		cep->mpa.hdr.params.bits |= MPA_RR_FLAG_REJECT; /* reject */
+		erdma_send_mpareqrep(cep, pdata, plen);
+	}
+
+	erdma_socket_disassoc(cep->sock);
+	sock_release(cep->sock);
+	cep->sock = NULL;
+
+	cep->state = ERDMA_EPSTATE_CLOSED;
+
+	erdma_cep_set_free(cep);
+	erdma_cep_put(cep);
+
+	return 0;
+}
+
+int erdma_create_listen(struct iw_cm_id *id, int backlog)
+{
+	struct socket *s;
+	struct erdma_cep *cep = NULL;
+	int ret = 0;
+	struct erdma_dev *dev = to_edev(id->device);
+	int addr_family = id->local_addr.ss_family;
+	struct sockaddr_in *laddr = &to_sockaddr_in(id->local_addr);
+
+	if (addr_family != AF_INET)
+		return -EAFNOSUPPORT;
+
+	ret = sock_create(addr_family, SOCK_STREAM, IPPROTO_TCP, &s);
+	if (ret < 0)
+		return ret;
+
+	sock_set_reuseaddr(s->sk);
+
+	/* For wildcard addr, limit binding to current device only */
+	if (ipv4_is_zeronet(laddr->sin_addr.s_addr))
+		s->sk->sk_bound_dev_if = dev->netdev->ifindex;
+
+	ret = s->ops->bind(s, (struct sockaddr *)laddr,
+			   sizeof(struct sockaddr_in));
+	if (ret)
+		goto error;
+
+	cep = erdma_cep_alloc(dev);
+	if (!cep) {
+		ret = -ENOMEM;
+		goto error;
+	}
+	erdma_cep_socket_assoc(cep, s);
+
+	ret = erdma_cm_alloc_work(cep, backlog);
+	if (ret)
+		goto error;
+
+	ret = s->ops->listen(s, backlog);
+	if (ret)
+		goto error;
+
+	cep->cm_id = id;
+	id->add_ref(id);
+
+	if (!id->provider_data) {
+		id->provider_data =
+			kmalloc(sizeof(struct list_head), GFP_KERNEL);
+		if (!id->provider_data) {
+			ret = -ENOMEM;
+			goto error;
+		}
+		INIT_LIST_HEAD((struct list_head *)id->provider_data);
+	}
+
+	list_add_tail(&cep->listenq, (struct list_head *)id->provider_data);
+	cep->state = ERDMA_EPSTATE_LISTENING;
+
+	return 0;
+
+error:
+	if (cep) {
+		erdma_cep_set_inuse(cep);
+
+		if (cep->cm_id) {
+			cep->cm_id->rem_ref(cep->cm_id);
+			cep->cm_id = NULL;
+		}
+		cep->sock = NULL;
+		erdma_socket_disassoc(s);
+		cep->state = ERDMA_EPSTATE_CLOSED;
+
+		erdma_cep_set_free(cep);
+		erdma_cep_put(cep);
+	}
+	sock_release(s);
+
+	return ret;
+}
+
+static void erdma_drop_listeners(struct iw_cm_id *id)
+{
+	struct list_head *p, *tmp;
+	/*
+	 * In case of a wildcard rdma_listen on a multi-homed device,
+	 * a listener's IWCM id is associated with more than one listening CEP.
+	 */
+	list_for_each_safe(p, tmp, (struct list_head *)id->provider_data) {
+		struct erdma_cep *cep =
+			list_entry(p, struct erdma_cep, listenq);
+
+		list_del(p);
+
+		erdma_cep_set_inuse(cep);
+
+		if (cep->cm_id) {
+			cep->cm_id->rem_ref(cep->cm_id);
+			cep->cm_id = NULL;
+		}
+		if (cep->sock) {
+			erdma_socket_disassoc(cep->sock);
+			sock_release(cep->sock);
+			cep->sock = NULL;
+		}
+		cep->state = ERDMA_EPSTATE_CLOSED;
+		erdma_cep_set_free(cep);
+		erdma_cep_put(cep);
+	}
+}
+
+int erdma_destroy_listen(struct iw_cm_id *id)
+{
+	if (!id->provider_data)
+		return 0;
+
+	erdma_drop_listeners(id);
+	kfree(id->provider_data);
+	id->provider_data = NULL;
+
+	return 0;
+}
+
+int erdma_cm_init(void)
+{
+	erdma_cm_wq = create_singlethread_workqueue("erdma_cm_wq");
+	if (!erdma_cm_wq)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void erdma_cm_exit(void)
+{
+	if (erdma_cm_wq)
+		destroy_workqueue(erdma_cm_wq);
+}
diff --git a/drivers/infiniband/hw/erdma/erdma_cm.h b/drivers/infiniband/hw/erdma/erdma_cm.h
new file mode 100644
index 000000000000..8a3f998fec9b
--- /dev/null
+++ b/drivers/infiniband/hw/erdma/erdma_cm.h
@@ -0,0 +1,167 @@
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
+
+/* Authors: Cheng Xu <chengyou@linux.alibaba.com> */
+/*          Kai Shen <kaishen@linux.alibaba.com> */
+/* Copyright (c) 2020-2022, Alibaba Group. */
+
+/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
+/*          Greg Joyce <greg@opengridcomputing.com> */
+/* Copyright (c) 2008-2019, IBM Corporation */
+/* Copyright (c) 2017, Open Grid Computing, Inc. */
+
+#ifndef __ERDMA_CM_H__
+#define __ERDMA_CM_H__
+
+#include <linux/tcp.h>
+#include <net/sock.h>
+#include <rdma/iw_cm.h>
+
+/* iWarp MPA protocol defs */
+#define MPA_REVISION_EXT_1 129
+#define MPA_MAX_PRIVDATA RDMA_MAX_PRIVATE_DATA
+#define MPA_KEY_REQ "MPA ID Req Frame"
+#define MPA_KEY_REP "MPA ID Rep Frame"
+#define MPA_KEY_SIZE 16
+#define MPA_DEFAULT_HDR_LEN 28
+
+struct mpa_rr_params {
+	__be16 bits;
+	__be16 pd_len;
+};
+
+/*
+ * MPA request/response Hdr bits & fields
+ */
+enum {
+	MPA_RR_FLAG_MARKERS = __cpu_to_be16(0x8000),
+	MPA_RR_FLAG_CRC = __cpu_to_be16(0x4000),
+	MPA_RR_FLAG_REJECT = __cpu_to_be16(0x2000),
+	MPA_RR_RESERVED = __cpu_to_be16(0x1f00),
+	MPA_RR_MASK_REVISION = __cpu_to_be16(0x00ff)
+};
+
+/*
+ * MPA request/reply header
+ */
+struct mpa_rr {
+	u8 key[16];
+	struct mpa_rr_params params;
+};
+
+struct erdma_mpa_ext {
+	__be32 cookie;
+	__be32 bits;
+};
+
+enum {
+	MPA_EXT_FLAG_CC = cpu_to_be32(0x0000000f),
+};
+
+struct erdma_mpa_info {
+	struct mpa_rr hdr; /* peer mpa hdr in host byte order */
+	struct erdma_mpa_ext ext_data;
+	char *pdata;
+	int bytes_rcvd;
+};
+
+struct erdma_sk_upcalls {
+	void (*sk_state_change)(struct sock *sk);
+	void (*sk_data_ready)(struct sock *sk, int bytes);
+	void (*sk_error_report)(struct sock *sk);
+};
+
+struct erdma_dev;
+
+enum erdma_cep_state {
+	ERDMA_EPSTATE_IDLE = 1,
+	ERDMA_EPSTATE_LISTENING,
+	ERDMA_EPSTATE_CONNECTING,
+	ERDMA_EPSTATE_AWAIT_MPAREQ,
+	ERDMA_EPSTATE_RECVD_MPAREQ,
+	ERDMA_EPSTATE_AWAIT_MPAREP,
+	ERDMA_EPSTATE_RDMA_MODE,
+	ERDMA_EPSTATE_CLOSED
+};
+
+struct erdma_cep {
+	struct iw_cm_id *cm_id;
+	struct erdma_dev *dev;
+	struct list_head devq;
+	spinlock_t lock;
+	struct kref ref;
+	int in_use;
+	wait_queue_head_t waitq;
+	enum erdma_cep_state state;
+
+	struct list_head listenq;
+	struct erdma_cep *listen_cep;
+
+	struct erdma_qp *qp;
+	struct socket *sock;
+
+	struct erdma_cm_work *mpa_timer;
+	struct list_head work_freelist;
+
+	struct erdma_mpa_info mpa;
+	int ord;
+	int ird;
+
+	int pd_len;
+	/* hold user's private data. */
+	void *private_data;
+
+	/* Saved upcalls of socket llp.sock */
+	void (*sk_state_change)(struct sock *sk);
+	void (*sk_data_ready)(struct sock *sk);
+	void (*sk_error_report)(struct sock *sk);
+};
+
+#define MPAREQ_TIMEOUT (HZ * 20)
+#define MPAREP_TIMEOUT (HZ * 10)
+#define CONNECT_TIMEOUT (HZ * 10)
+
+enum erdma_work_type {
+	ERDMA_CM_WORK_ACCEPT = 1,
+	ERDMA_CM_WORK_READ_MPAHDR,
+	ERDMA_CM_WORK_CLOSE_LLP, /* close socket */
+	ERDMA_CM_WORK_PEER_CLOSE, /* socket indicated peer close */
+	ERDMA_CM_WORK_MPATIMEOUT,
+	ERDMA_CM_WORK_CONNECTED,
+	ERDMA_CM_WORK_CONNECTTIMEOUT
+};
+
+struct erdma_cm_work {
+	struct delayed_work work;
+	struct list_head list;
+	enum erdma_work_type type;
+	struct erdma_cep *cep;
+};
+
+#define to_sockaddr_in(a) (*(struct sockaddr_in *)(&(a)))
+
+static inline int getname_peer(struct socket *s, struct sockaddr_storage *a)
+{
+	return s->ops->getname(s, (struct sockaddr *)a, 1);
+}
+
+static inline int getname_local(struct socket *s, struct sockaddr_storage *a)
+{
+	return s->ops->getname(s, (struct sockaddr *)a, 0);
+}
+
+int erdma_connect(struct iw_cm_id *id, struct iw_cm_conn_param *param);
+int erdma_accept(struct iw_cm_id *id, struct iw_cm_conn_param *param);
+int erdma_reject(struct iw_cm_id *id, const void *pdata, u8 plen);
+int erdma_create_listen(struct iw_cm_id *id, int backlog);
+int erdma_destroy_listen(struct iw_cm_id *id);
+
+void erdma_cep_get(struct erdma_cep *ceq);
+void erdma_cep_put(struct erdma_cep *ceq);
+int erdma_cm_queue_work(struct erdma_cep *ceq, enum erdma_work_type type);
+
+int erdma_cm_init(void);
+void erdma_cm_exit(void);
+
+#define sk_to_cep(sk) ((struct erdma_cep *)((sk)->sk_user_data))
+
+#endif
diff --git a/drivers/infiniband/hw/erdma/erdma_cmdq.c b/drivers/infiniband/hw/erdma/erdma_cmdq.c
new file mode 100644
index 000000000000..57da0c670472
--- /dev/null
+++ b/drivers/infiniband/hw/erdma/erdma_cmdq.c
@@ -0,0 +1,493 @@
+// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
+
+/* Authors: Cheng Xu <chengyou@linux.alibaba.com> */
+/*          Kai Shen <kaishen@linux.alibaba.com> */
+/* Copyright (c) 2020-2022, Alibaba Group. */
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/types.h>
+
+#include "erdma.h"
+#include "erdma_hw.h"
+#include "erdma_verbs.h"
+
+static void arm_cmdq_cq(struct erdma_cmdq *cmdq)
+{
+	struct erdma_dev *dev = container_of(cmdq, struct erdma_dev, cmdq);
+	u64 db_data = FIELD_PREP(ERDMA_CQDB_CI_MASK, cmdq->cq.ci) |
+		      FIELD_PREP(ERDMA_CQDB_ARM_MASK, 1) |
+		      FIELD_PREP(ERDMA_CQDB_CMDSN_MASK, cmdq->cq.cmdsn) |
+		      FIELD_PREP(ERDMA_CQDB_IDX_MASK, cmdq->cq.cmdsn);
+
+	*cmdq->cq.db_record = db_data;
+	writeq(db_data, dev->func_bar + ERDMA_CMDQ_CQDB_REG);
+
+	atomic64_inc(&cmdq->cq.armed_num);
+}
+
+static void kick_cmdq_db(struct erdma_cmdq *cmdq)
+{
+	struct erdma_dev *dev = container_of(cmdq, struct erdma_dev, cmdq);
+	u64 db_data = FIELD_PREP(ERDMA_CMD_HDR_WQEBB_INDEX_MASK, cmdq->sq.pi);
+
+	*cmdq->sq.db_record = db_data;
+	writeq(db_data, dev->func_bar + ERDMA_CMDQ_SQDB_REG);
+}
+
+static struct erdma_comp_wait *get_comp_wait(struct erdma_cmdq *cmdq)
+{
+	int comp_idx;
+
+	spin_lock(&cmdq->lock);
+	comp_idx = find_first_zero_bit(cmdq->comp_wait_bitmap,
+				       cmdq->max_outstandings);
+	if (comp_idx == cmdq->max_outstandings) {
+		spin_unlock(&cmdq->lock);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	__set_bit(comp_idx, cmdq->comp_wait_bitmap);
+	spin_unlock(&cmdq->lock);
+
+	return &cmdq->wait_pool[comp_idx];
+}
+
+static void put_comp_wait(struct erdma_cmdq *cmdq,
+			  struct erdma_comp_wait *comp_wait)
+{
+	int used;
+
+	cmdq->wait_pool[comp_wait->ctx_id].cmd_status = ERDMA_CMD_STATUS_INIT;
+	spin_lock(&cmdq->lock);
+	used = __test_and_clear_bit(comp_wait->ctx_id, cmdq->comp_wait_bitmap);
+	spin_unlock(&cmdq->lock);
+
+	WARN_ON(!used);
+}
+
+static int erdma_cmdq_wait_res_init(struct erdma_dev *dev,
+				    struct erdma_cmdq *cmdq)
+{
+	int i;
+
+	cmdq->wait_pool =
+		devm_kcalloc(&dev->pdev->dev, cmdq->max_outstandings,
+			     sizeof(struct erdma_comp_wait), GFP_KERNEL);
+	if (!cmdq->wait_pool)
+		return -ENOMEM;
+
+	spin_lock_init(&cmdq->lock);
+	cmdq->comp_wait_bitmap = devm_bitmap_zalloc(
+		&dev->pdev->dev, cmdq->max_outstandings, GFP_KERNEL);
+	if (!cmdq->comp_wait_bitmap)
+		return -ENOMEM;
+
+	for (i = 0; i < cmdq->max_outstandings; i++) {
+		init_completion(&cmdq->wait_pool[i].wait_event);
+		cmdq->wait_pool[i].ctx_id = i;
+	}
+
+	return 0;
+}
+
+static int erdma_cmdq_sq_init(struct erdma_dev *dev)
+{
+	struct erdma_cmdq *cmdq = &dev->cmdq;
+	struct erdma_cmdq_sq *sq = &cmdq->sq;
+	u32 buf_size;
+
+	sq->wqebb_cnt = SQEBB_COUNT(ERDMA_CMDQ_SQE_SIZE);
+	sq->depth = cmdq->max_outstandings * sq->wqebb_cnt;
+
+	buf_size = sq->depth << SQEBB_SHIFT;
+
+	sq->qbuf =
+		dma_alloc_coherent(&dev->pdev->dev, WARPPED_BUFSIZE(buf_size),
+				   &sq->qbuf_dma_addr, GFP_KERNEL);
+	if (!sq->qbuf)
+		return -ENOMEM;
+
+	sq->db_record = (u64 *)(sq->qbuf + buf_size);
+
+	spin_lock_init(&sq->lock);
+
+	erdma_reg_write32(dev, ERDMA_REGS_CMDQ_SQ_ADDR_H_REG,
+			  upper_32_bits(sq->qbuf_dma_addr));
+	erdma_reg_write32(dev, ERDMA_REGS_CMDQ_SQ_ADDR_L_REG,
+			  lower_32_bits(sq->qbuf_dma_addr));
+	erdma_reg_write32(dev, ERDMA_REGS_CMDQ_DEPTH_REG, sq->depth);
+	erdma_reg_write64(dev, ERDMA_CMDQ_SQ_DB_HOST_ADDR_REG,
+			  sq->qbuf_dma_addr + buf_size);
+
+	return 0;
+}
+
+static int erdma_cmdq_cq_init(struct erdma_dev *dev)
+{
+	struct erdma_cmdq *cmdq = &dev->cmdq;
+	struct erdma_cmdq_cq *cq = &cmdq->cq;
+	u32 buf_size;
+
+	cq->depth = cmdq->sq.depth;
+	buf_size = cq->depth << CQE_SHIFT;
+
+	cq->qbuf =
+		dma_alloc_coherent(&dev->pdev->dev, WARPPED_BUFSIZE(buf_size),
+				   &cq->qbuf_dma_addr, GFP_KERNEL | __GFP_ZERO);
+	if (!cq->qbuf)
+		return -ENOMEM;
+
+	spin_lock_init(&cq->lock);
+
+	cq->db_record = (u64 *)(cq->qbuf + buf_size);
+
+	atomic64_set(&cq->armed_num, 0);
+
+	erdma_reg_write32(dev, ERDMA_REGS_CMDQ_CQ_ADDR_H_REG,
+			  upper_32_bits(cq->qbuf_dma_addr));
+	erdma_reg_write32(dev, ERDMA_REGS_CMDQ_CQ_ADDR_L_REG,
+			  lower_32_bits(cq->qbuf_dma_addr));
+	erdma_reg_write64(dev, ERDMA_CMDQ_CQ_DB_HOST_ADDR_REG,
+			  cq->qbuf_dma_addr + buf_size);
+
+	return 0;
+}
+
+static int erdma_cmdq_eq_init(struct erdma_dev *dev)
+{
+	struct erdma_cmdq *cmdq = &dev->cmdq;
+	struct erdma_eq *eq = &cmdq->eq;
+	u32 buf_size;
+
+	eq->depth = cmdq->max_outstandings;
+	buf_size = eq->depth << EQE_SHIFT;
+
+	eq->qbuf =
+		dma_alloc_coherent(&dev->pdev->dev, WARPPED_BUFSIZE(buf_size),
+				   &eq->qbuf_dma_addr, GFP_KERNEL | __GFP_ZERO);
+	if (!eq->qbuf)
+		return -ENOMEM;
+
+	spin_lock_init(&eq->lock);
+	atomic64_set(&eq->event_num, 0);
+
+	eq->db_addr =
+		(u64 __iomem *)(dev->func_bar + ERDMA_REGS_CEQ_DB_BASE_REG);
+	eq->db_record = (u64 *)(eq->qbuf + buf_size);
+
+	erdma_reg_write32(dev, ERDMA_REGS_CMDQ_EQ_ADDR_H_REG,
+			  upper_32_bits(eq->qbuf_dma_addr));
+	erdma_reg_write32(dev, ERDMA_REGS_CMDQ_EQ_ADDR_L_REG,
+			  lower_32_bits(eq->qbuf_dma_addr));
+	erdma_reg_write32(dev, ERDMA_REGS_CMDQ_EQ_DEPTH_REG, eq->depth);
+	erdma_reg_write64(dev, ERDMA_CMDQ_EQ_DB_HOST_ADDR_REG,
+			  eq->qbuf_dma_addr + buf_size);
+
+	return 0;
+}
+
+int erdma_cmdq_init(struct erdma_dev *dev)
+{
+	int err, i;
+	struct erdma_cmdq *cmdq = &dev->cmdq;
+	u32 sts, ctrl;
+
+	cmdq->max_outstandings = ERDMA_CMDQ_MAX_OUTSTANDING;
+	cmdq->use_event = false;
+
+	sema_init(&cmdq->credits, cmdq->max_outstandings);
+
+	err = erdma_cmdq_wait_res_init(dev, cmdq);
+	if (err)
+		return err;
+
+	err = erdma_cmdq_sq_init(dev);
+	if (err)
+		return err;
+
+	err = erdma_cmdq_cq_init(dev);
+	if (err)
+		goto err_destroy_sq;
+
+	err = erdma_cmdq_eq_init(dev);
+	if (err)
+		goto err_destroy_cq;
+
+	ctrl = FIELD_PREP(ERDMA_REG_DEV_CTRL_INIT_MASK, 1);
+	erdma_reg_write32(dev, ERDMA_REGS_DEV_CTRL_REG, ctrl);
+
+	for (i = 0; i < ERDMA_WAIT_DEV_DONE_CNT; i++) {
+		sts = erdma_reg_read32_filed(dev, ERDMA_REGS_DEV_ST_REG,
+					     ERDMA_REG_DEV_ST_INIT_DONE_MASK);
+		if (sts)
+			break;
+
+		msleep(ERDMA_REG_ACCESS_WAIT_MS);
+	}
+
+	if (i == ERDMA_WAIT_DEV_DONE_CNT) {
+		dev_err(&dev->pdev->dev, "wait init done failed.\n");
+		err = -ETIMEDOUT;
+		goto err_destroy_eq;
+	}
+
+	set_bit(ERDMA_CMDQ_STATE_OK_BIT, &cmdq->state);
+
+	return 0;
+
+err_destroy_eq:
+	dma_free_coherent(&dev->pdev->dev,
+			  (cmdq->eq.depth << EQE_SHIFT) +
+				  ERDMA_EXTRA_BUFFER_SIZE,
+			  cmdq->eq.qbuf, cmdq->eq.qbuf_dma_addr);
+
+err_destroy_cq:
+	dma_free_coherent(&dev->pdev->dev,
+			  (cmdq->cq.depth << CQE_SHIFT) +
+				  ERDMA_EXTRA_BUFFER_SIZE,
+			  cmdq->cq.qbuf, cmdq->cq.qbuf_dma_addr);
+
+err_destroy_sq:
+	dma_free_coherent(&dev->pdev->dev,
+			  (cmdq->sq.depth << SQEBB_SHIFT) +
+				  ERDMA_EXTRA_BUFFER_SIZE,
+			  cmdq->sq.qbuf, cmdq->sq.qbuf_dma_addr);
+
+	return err;
+}
+
+void erdma_finish_cmdq_init(struct erdma_dev *dev)
+{
+	/* after device init successfully, change cmdq to event mode. */
+	dev->cmdq.use_event = true;
+	arm_cmdq_cq(&dev->cmdq);
+}
+
+void erdma_cmdq_destroy(struct erdma_dev *dev)
+{
+	struct erdma_cmdq *cmdq = &dev->cmdq;
+
+	clear_bit(ERDMA_CMDQ_STATE_OK_BIT, &cmdq->state);
+
+	dma_free_coherent(&dev->pdev->dev,
+			  (cmdq->eq.depth << EQE_SHIFT) +
+				  ERDMA_EXTRA_BUFFER_SIZE,
+			  cmdq->eq.qbuf, cmdq->eq.qbuf_dma_addr);
+	dma_free_coherent(&dev->pdev->dev,
+			  (cmdq->sq.depth << SQEBB_SHIFT) +
+				  ERDMA_EXTRA_BUFFER_SIZE,
+			  cmdq->sq.qbuf, cmdq->sq.qbuf_dma_addr);
+	dma_free_coherent(&dev->pdev->dev,
+			  (cmdq->cq.depth << CQE_SHIFT) +
+				  ERDMA_EXTRA_BUFFER_SIZE,
+			  cmdq->cq.qbuf, cmdq->cq.qbuf_dma_addr);
+}
+
+static void *get_next_valid_cmdq_cqe(struct erdma_cmdq *cmdq)
+{
+	__be32 *cqe = get_queue_entry(cmdq->cq.qbuf, cmdq->cq.ci,
+				      cmdq->cq.depth, CQE_SHIFT);
+	u32 owner = FIELD_GET(ERDMA_CQE_HDR_OWNER_MASK,
+			      __be32_to_cpu(READ_ONCE(*cqe)));
+
+	return owner ^ !!(cmdq->cq.ci & cmdq->cq.depth) ? cqe : NULL;
+}
+
+static void push_cmdq_sqe(struct erdma_cmdq *cmdq, u64 *req, size_t req_len,
+			  struct erdma_comp_wait *comp_wait)
+{
+	__le64 *wqe;
+	u64 hdr = *req;
+
+	comp_wait->cmd_status = ERDMA_CMD_STATUS_ISSUED;
+	reinit_completion(&comp_wait->wait_event);
+	comp_wait->sq_pi = cmdq->sq.pi;
+
+	wqe = get_queue_entry(cmdq->sq.qbuf, cmdq->sq.pi, cmdq->sq.depth,
+			      SQEBB_SHIFT);
+	memcpy(wqe, req, req_len);
+
+	cmdq->sq.pi += cmdq->sq.wqebb_cnt;
+	hdr |= FIELD_PREP(ERDMA_CMD_HDR_WQEBB_INDEX_MASK, cmdq->sq.pi) |
+	       FIELD_PREP(ERDMA_CMD_HDR_CONTEXT_COOKIE_MASK,
+			  comp_wait->ctx_id) |
+	       FIELD_PREP(ERDMA_CMD_HDR_WQEBB_CNT_MASK, cmdq->sq.wqebb_cnt - 1);
+	*wqe = cpu_to_le64(hdr);
+
+	kick_cmdq_db(cmdq);
+}
+
+static int erdma_poll_single_cmd_completion(struct erdma_cmdq *cmdq)
+{
+	struct erdma_comp_wait *comp_wait;
+	u32 hdr0, sqe_idx;
+	__be32 *cqe;
+	u16 ctx_id;
+	u64 *sqe;
+	int i;
+
+	cqe = get_next_valid_cmdq_cqe(cmdq);
+	if (!cqe)
+		return -EAGAIN;
+
+	cmdq->cq.ci++;
+
+	dma_rmb();
+	hdr0 = __be32_to_cpu(*cqe);
+	sqe_idx = __be32_to_cpu(*(cqe + 1));
+
+	sqe = get_queue_entry(cmdq->sq.qbuf, sqe_idx, cmdq->sq.depth,
+			      SQEBB_SHIFT);
+	ctx_id = FIELD_GET(ERDMA_CMD_HDR_CONTEXT_COOKIE_MASK, *sqe);
+	comp_wait = &cmdq->wait_pool[ctx_id];
+	if (comp_wait->cmd_status != ERDMA_CMD_STATUS_ISSUED)
+		return -EIO;
+
+	comp_wait->cmd_status = ERDMA_CMD_STATUS_FINISHED;
+	comp_wait->comp_status = FIELD_GET(ERDMA_CQE_HDR_SYNDROME_MASK, hdr0);
+	cmdq->sq.ci += cmdq->sq.wqebb_cnt;
+
+	for (i = 0; i < 4; i++)
+		comp_wait->comp_data[i] = __be32_to_cpu(*(cqe + 2 + i));
+
+	if (cmdq->use_event)
+		complete(&comp_wait->wait_event);
+
+	return 0;
+}
+
+static void erdma_polling_cmd_completions(struct erdma_cmdq *cmdq)
+{
+	unsigned long flags;
+	u16 comp_num;
+
+	spin_lock_irqsave(&cmdq->cq.lock, flags);
+
+	/* We must have less than # of max_outstandings
+	 * completions at one time.
+	 */
+	for (comp_num = 0; comp_num < cmdq->max_outstandings; comp_num++)
+		if (erdma_poll_single_cmd_completion(cmdq))
+			break;
+
+	if (comp_num && cmdq->use_event)
+		arm_cmdq_cq(cmdq);
+
+	spin_unlock_irqrestore(&cmdq->cq.lock, flags);
+}
+
+void erdma_cmdq_completion_handler(struct erdma_cmdq *cmdq)
+{
+	int got_event = 0;
+
+	if (!test_bit(ERDMA_CMDQ_STATE_OK_BIT, &cmdq->state) ||
+	    !cmdq->use_event)
+		return;
+
+	while (get_next_valid_eqe(&cmdq->eq)) {
+		cmdq->eq.ci++;
+		got_event++;
+	}
+
+	if (got_event) {
+		cmdq->cq.cmdsn++;
+		erdma_polling_cmd_completions(cmdq);
+	}
+
+	notify_eq(&cmdq->eq);
+}
+
+static int erdma_poll_cmd_completion(struct erdma_comp_wait *comp_ctx,
+				     struct erdma_cmdq *cmdq, u32 timeout)
+{
+	unsigned long comp_timeout = jiffies + msecs_to_jiffies(timeout);
+
+	while (1) {
+		erdma_polling_cmd_completions(cmdq);
+		if (comp_ctx->cmd_status != ERDMA_CMD_STATUS_ISSUED)
+			break;
+
+		if (time_is_before_jiffies(comp_timeout))
+			return -ETIME;
+
+		msleep(20);
+	}
+
+	return 0;
+}
+
+static int erdma_wait_cmd_completion(struct erdma_comp_wait *comp_ctx,
+				     struct erdma_cmdq *cmdq, u32 timeout)
+{
+	unsigned long flags = 0;
+
+	wait_for_completion_timeout(&comp_ctx->wait_event,
+				    msecs_to_jiffies(timeout));
+
+	if (unlikely(comp_ctx->cmd_status != ERDMA_CMD_STATUS_FINISHED)) {
+		spin_lock_irqsave(&cmdq->cq.lock, flags);
+		comp_ctx->cmd_status = ERDMA_CMD_STATUS_TIMEOUT;
+		spin_unlock_irqrestore(&cmdq->cq.lock, flags);
+		return -ETIME;
+	}
+
+	return 0;
+}
+
+void erdma_cmdq_build_reqhdr(u64 *hdr, u32 mod, u32 op)
+{
+	*hdr = FIELD_PREP(ERDMA_CMD_HDR_SUB_MOD_MASK, mod) |
+	       FIELD_PREP(ERDMA_CMD_HDR_OPCODE_MASK, op);
+}
+
+int erdma_post_cmd_wait(struct erdma_cmdq *cmdq, u64 *req, u32 req_size,
+			u64 *resp0, u64 *resp1)
+{
+	struct erdma_comp_wait *comp_wait;
+	int ret;
+
+	if (!test_bit(ERDMA_CMDQ_STATE_OK_BIT, &cmdq->state))
+		return -ENODEV;
+
+	down(&cmdq->credits);
+
+	comp_wait = get_comp_wait(cmdq);
+	if (IS_ERR(comp_wait)) {
+		clear_bit(ERDMA_CMDQ_STATE_OK_BIT, &cmdq->state);
+		set_bit(ERDMA_CMDQ_STATE_CTX_ERR_BIT, &cmdq->state);
+		up(&cmdq->credits);
+		return PTR_ERR(comp_wait);
+	}
+
+	spin_lock(&cmdq->sq.lock);
+	push_cmdq_sqe(cmdq, req, req_size, comp_wait);
+	spin_unlock(&cmdq->sq.lock);
+
+	if (cmdq->use_event)
+		ret = erdma_wait_cmd_completion(comp_wait, cmdq,
+						ERDMA_CMDQ_TIMEOUT_MS);
+	else
+		ret = erdma_poll_cmd_completion(comp_wait, cmdq,
+						ERDMA_CMDQ_TIMEOUT_MS);
+
+	if (ret) {
+		set_bit(ERDMA_CMDQ_STATE_TIMEOUT_BIT, &cmdq->state);
+		clear_bit(ERDMA_CMDQ_STATE_OK_BIT, &cmdq->state);
+		goto out;
+	}
+
+	if (comp_wait->comp_status)
+		ret = -EIO;
+
+	if (resp0 && resp1) {
+		*resp0 = *((u64 *)&comp_wait->comp_data[0]);
+		*resp1 = *((u64 *)&comp_wait->comp_data[2]);
+	}
+	put_comp_wait(cmdq, comp_wait);
+
+out:
+	up(&cmdq->credits);
+
+	return ret;
+}
diff --git a/drivers/infiniband/hw/erdma/erdma_cq.c b/drivers/infiniband/hw/erdma/erdma_cq.c
new file mode 100644
index 000000000000..751c7f9f0de7
--- /dev/null
+++ b/drivers/infiniband/hw/erdma/erdma_cq.c
@@ -0,0 +1,205 @@
+// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
+
+/* Authors: Cheng Xu <chengyou@linux.alibaba.com> */
+/*          Kai Shen <kaishen@linux.alibaba.com> */
+/* Copyright (c) 2020-2022, Alibaba Group. */
+
+#include <rdma/ib_verbs.h>
+
+#include "erdma_hw.h"
+#include "erdma_verbs.h"
+
+static void *get_next_valid_cqe(struct erdma_cq *cq)
+{
+	__be32 *cqe = get_queue_entry(cq->kern_cq.qbuf, cq->kern_cq.ci,
+				      cq->depth, CQE_SHIFT);
+	u32 owner = FIELD_GET(ERDMA_CQE_HDR_OWNER_MASK,
+			      __be32_to_cpu(READ_ONCE(*cqe)));
+
+	return owner ^ !!(cq->kern_cq.ci & cq->depth) ? cqe : NULL;
+}
+
+static void notify_cq(struct erdma_cq *cq, u8 solcitied)
+{
+	u64 db_data =
+		FIELD_PREP(ERDMA_CQDB_IDX_MASK, (cq->kern_cq.notify_cnt)) |
+		FIELD_PREP(ERDMA_CQDB_CQN_MASK, cq->cqn) |
+		FIELD_PREP(ERDMA_CQDB_ARM_MASK, 1) |
+		FIELD_PREP(ERDMA_CQDB_SOL_MASK, solcitied) |
+		FIELD_PREP(ERDMA_CQDB_CMDSN_MASK, cq->kern_cq.cmdsn) |
+		FIELD_PREP(ERDMA_CQDB_CI_MASK, cq->kern_cq.ci);
+
+	*cq->kern_cq.db_record = db_data;
+	writeq(db_data, cq->kern_cq.db);
+}
+
+int erdma_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags)
+{
+	struct erdma_cq *cq = to_ecq(ibcq);
+	unsigned long irq_flags;
+	int ret = 0;
+
+	spin_lock_irqsave(&cq->kern_cq.lock, irq_flags);
+
+	notify_cq(cq, (flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED);
+
+	if ((flags & IB_CQ_REPORT_MISSED_EVENTS) && get_next_valid_cqe(cq))
+		ret = 1;
+
+	cq->kern_cq.notify_cnt++;
+
+	spin_unlock_irqrestore(&cq->kern_cq.lock, irq_flags);
+
+	return ret;
+}
+
+static const enum ib_wc_opcode wc_mapping_table[ERDMA_NUM_OPCODES] = {
+	[ERDMA_OP_WRITE] = IB_WC_RDMA_WRITE,
+	[ERDMA_OP_READ] = IB_WC_RDMA_READ,
+	[ERDMA_OP_SEND] = IB_WC_SEND,
+	[ERDMA_OP_SEND_WITH_IMM] = IB_WC_SEND,
+	[ERDMA_OP_RECEIVE] = IB_WC_RECV,
+	[ERDMA_OP_RECV_IMM] = IB_WC_RECV_RDMA_WITH_IMM,
+	[ERDMA_OP_RECV_INV] = IB_WC_RECV,
+	[ERDMA_OP_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE,
+	[ERDMA_OP_INVALIDATE] = IB_WC_LOCAL_INV,
+	[ERDMA_OP_RSP_SEND_IMM] = IB_WC_RECV,
+	[ERDMA_OP_SEND_WITH_INV] = IB_WC_SEND,
+	[ERDMA_OP_REG_MR] = IB_WC_REG_MR,
+	[ERDMA_OP_LOCAL_INV] = IB_WC_LOCAL_INV,
+	[ERDMA_OP_READ_WITH_INV] = IB_WC_RDMA_READ,
+};
+
+static const struct {
+	enum erdma_wc_status erdma;
+	enum ib_wc_status base;
+	enum erdma_vendor_err vendor;
+} map_cqe_status[ERDMA_NUM_WC_STATUS] = {
+	{ ERDMA_WC_SUCCESS, IB_WC_SUCCESS, ERDMA_WC_VENDOR_NO_ERR },
+	{ ERDMA_WC_GENERAL_ERR, IB_WC_GENERAL_ERR, ERDMA_WC_VENDOR_NO_ERR },
+	{ ERDMA_WC_RECV_WQE_FORMAT_ERR, IB_WC_GENERAL_ERR,
+	  ERDMA_WC_VENDOR_INVALID_RQE },
+	{ ERDMA_WC_RECV_STAG_INVALID_ERR, IB_WC_REM_ACCESS_ERR,
+	  ERDMA_WC_VENDOR_RQE_INVALID_STAG },
+	{ ERDMA_WC_RECV_ADDR_VIOLATION_ERR, IB_WC_REM_ACCESS_ERR,
+	  ERDMA_WC_VENDOR_RQE_ADDR_VIOLATION },
+	{ ERDMA_WC_RECV_RIGHT_VIOLATION_ERR, IB_WC_REM_ACCESS_ERR,
+	  ERDMA_WC_VENDOR_RQE_ACCESS_RIGHT_ERR },
+	{ ERDMA_WC_RECV_PDID_ERR, IB_WC_REM_ACCESS_ERR,
+	  ERDMA_WC_VENDOR_RQE_INVALID_PD },
+	{ ERDMA_WC_RECV_WARRPING_ERR, IB_WC_REM_ACCESS_ERR,
+	  ERDMA_WC_VENDOR_RQE_WRAP_ERR },
+	{ ERDMA_WC_SEND_WQE_FORMAT_ERR, IB_WC_LOC_QP_OP_ERR,
+	  ERDMA_WC_VENDOR_INVALID_SQE },
+	{ ERDMA_WC_SEND_WQE_ORD_EXCEED, IB_WC_GENERAL_ERR,
+	  ERDMA_WC_VENDOR_ZERO_ORD },
+	{ ERDMA_WC_SEND_STAG_INVALID_ERR, IB_WC_LOC_ACCESS_ERR,
+	  ERDMA_WC_VENDOR_SQE_INVALID_STAG },
+	{ ERDMA_WC_SEND_ADDR_VIOLATION_ERR, IB_WC_LOC_ACCESS_ERR,
+	  ERDMA_WC_VENDOR_SQE_ADDR_VIOLATION },
+	{ ERDMA_WC_SEND_RIGHT_VIOLATION_ERR, IB_WC_LOC_ACCESS_ERR,
+	  ERDMA_WC_VENDOR_SQE_ACCESS_ERR },
+	{ ERDMA_WC_SEND_PDID_ERR, IB_WC_LOC_ACCESS_ERR,
+	  ERDMA_WC_VENDOR_SQE_INVALID_PD },
+	{ ERDMA_WC_SEND_WARRPING_ERR, IB_WC_LOC_ACCESS_ERR,
+	  ERDMA_WC_VENDOR_SQE_WARP_ERR },
+	{ ERDMA_WC_FLUSH_ERR, IB_WC_WR_FLUSH_ERR, ERDMA_WC_VENDOR_NO_ERR },
+	{ ERDMA_WC_RETRY_EXC_ERR, IB_WC_RETRY_EXC_ERR, ERDMA_WC_VENDOR_NO_ERR },
+};
+
+#define ERDMA_POLLCQ_NO_QP 1
+
+static int erdma_poll_one_cqe(struct erdma_cq *cq, struct ib_wc *wc)
+{
+	struct erdma_dev *dev = to_edev(cq->ibcq.device);
+	u8 opcode, syndrome, qtype;
+	struct erdma_kqp *kern_qp;
+	struct erdma_cqe *cqe;
+	struct erdma_qp *qp;
+	u16 wqe_idx, depth;
+	u32 qpn, cqe_hdr;
+	u64 *id_table;
+	u64 *wqe_hdr;
+
+	cqe = get_next_valid_cqe(cq);
+	if (!cqe)
+		return -EAGAIN;
+
+	cq->kern_cq.ci++;
+
+	/* cqbuf should be ready when we poll */
+	dma_rmb();
+
+	qpn = be32_to_cpu(cqe->qpn);
+	wqe_idx = be32_to_cpu(cqe->qe_idx);
+	cqe_hdr = be32_to_cpu(cqe->hdr);
+
+	qp = find_qp_by_qpn(dev, qpn);
+	if (!qp)
+		return ERDMA_POLLCQ_NO_QP;
+
+	kern_qp = &qp->kern_qp;
+
+	qtype = FIELD_GET(ERDMA_CQE_HDR_QTYPE_MASK, cqe_hdr);
+	syndrome = FIELD_GET(ERDMA_CQE_HDR_SYNDROME_MASK, cqe_hdr);
+	opcode = FIELD_GET(ERDMA_CQE_HDR_OPCODE_MASK, cqe_hdr);
+
+	if (qtype == ERDMA_CQE_QTYPE_SQ) {
+		id_table = kern_qp->swr_tbl;
+		depth = qp->attrs.sq_size;
+		wqe_hdr = get_queue_entry(qp->kern_qp.sq_buf, wqe_idx,
+					  qp->attrs.sq_size, SQEBB_SHIFT);
+		kern_qp->sq_ci =
+			FIELD_GET(ERDMA_SQE_HDR_WQEBB_CNT_MASK, *wqe_hdr) +
+			wqe_idx + 1;
+	} else {
+		id_table = kern_qp->rwr_tbl;
+		depth = qp->attrs.rq_size;
+	}
+	wc->wr_id = id_table[wqe_idx & (depth - 1)];
+	wc->byte_len = be32_to_cpu(cqe->size);
+
+	wc->wc_flags = 0;
+
+	wc->opcode = wc_mapping_table[opcode];
+	if (opcode == ERDMA_OP_RECV_IMM || opcode == ERDMA_OP_RSP_SEND_IMM) {
+		wc->ex.imm_data = cpu_to_be32(le32_to_cpu(cqe->imm_data));
+		wc->wc_flags |= IB_WC_WITH_IMM;
+	} else if (opcode == ERDMA_OP_RECV_INV) {
+		wc->ex.invalidate_rkey = be32_to_cpu(cqe->inv_rkey);
+		wc->wc_flags |= IB_WC_WITH_INVALIDATE;
+	}
+
+	if (syndrome >= ERDMA_NUM_WC_STATUS)
+		syndrome = ERDMA_WC_GENERAL_ERR;
+
+	wc->status = map_cqe_status[syndrome].base;
+	wc->vendor_err = map_cqe_status[syndrome].vendor;
+	wc->qp = &qp->ibqp;
+
+	return 0;
+}
+
+int erdma_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc)
+{
+	struct erdma_cq *cq = to_ecq(ibcq);
+	unsigned long flags;
+	int npolled, ret;
+
+	spin_lock_irqsave(&cq->kern_cq.lock, flags);
+
+	for (npolled = 0; npolled < num_entries;) {
+		ret = erdma_poll_one_cqe(cq, wc + npolled);
+
+		if (ret == -EAGAIN) /* no received new CQEs. */
+			break;
+		else if (ret) /* ignore invalid CQEs. */
+			continue;
+
+		npolled++;
+	}
+
+	spin_unlock_irqrestore(&cq->kern_cq.lock, flags);
+
+	return npolled;
+}
diff --git a/drivers/infiniband/hw/erdma/erdma_eq.c b/drivers/infiniband/hw/erdma/erdma_eq.c
new file mode 100644
index 000000000000..8f2d094e0227
--- /dev/null
+++ b/drivers/infiniband/hw/erdma/erdma_eq.c
@@ -0,0 +1,329 @@
+// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
+
+/* Authors: Cheng Xu <chengyou@linux.alibaba.com> */
+/*          Kai Shen <kaishen@linux.alibaba.com> */
+/* Copyright (c) 2020-2022, Alibaba Group. */
+
+#include <linux/errno.h>
+#include <linux/pci.h>
+#include <linux/types.h>
+
+#include "erdma.h"
+#include "erdma_hw.h"
+#include "erdma_verbs.h"
+
+#define MAX_POLL_CHUNK_SIZE 16
+
+void notify_eq(struct erdma_eq *eq)
+{
+	u64 db_data = FIELD_PREP(ERDMA_EQDB_CI_MASK, eq->ci) |
+		      FIELD_PREP(ERDMA_EQDB_ARM_MASK, 1);
+
+	*eq->db_record = db_data;
+	writeq(db_data, eq->db_addr);
+
+	atomic64_inc(&eq->notify_num);
+}
+
+void *get_next_valid_eqe(struct erdma_eq *eq)
+{
+	u64 *eqe = get_queue_entry(eq->qbuf, eq->ci, eq->depth, EQE_SHIFT);
+	u32 owner = FIELD_GET(ERDMA_CEQE_HDR_O_MASK, READ_ONCE(*eqe));
+
+	return owner ^ !!(eq->ci & eq->depth) ? eqe : NULL;
+}
+
+void erdma_aeq_event_handler(struct erdma_dev *dev)
+{
+	struct erdma_aeqe *aeqe;
+	u32 cqn, qpn;
+	struct erdma_qp *qp;
+	struct erdma_cq *cq;
+	struct ib_event event;
+	u32 poll_cnt = 0;
+
+	memset(&event, 0, sizeof(event));
+
+	while (poll_cnt < MAX_POLL_CHUNK_SIZE) {
+		aeqe = get_next_valid_eqe(&dev->aeq);
+		if (!aeqe)
+			break;
+
+		dma_rmb();
+
+		dev->aeq.ci++;
+		atomic64_inc(&dev->aeq.event_num);
+		poll_cnt++;
+
+		if (FIELD_GET(ERDMA_AEQE_HDR_TYPE_MASK,
+			      le32_to_cpu(aeqe->hdr)) == ERDMA_AE_TYPE_CQ_ERR) {
+			cqn = le32_to_cpu(aeqe->event_data0);
+			cq = find_cq_by_cqn(dev, cqn);
+			if (!cq)
+				continue;
+
+			event.device = cq->ibcq.device;
+			event.element.cq = &cq->ibcq;
+			event.event = IB_EVENT_CQ_ERR;
+			if (cq->ibcq.event_handler)
+				cq->ibcq.event_handler(&event,
+						       cq->ibcq.cq_context);
+		} else {
+			qpn = le32_to_cpu(aeqe->event_data0);
+			qp = find_qp_by_qpn(dev, qpn);
+			if (!qp)
+				continue;
+
+			event.device = qp->ibqp.device;
+			event.element.qp = &qp->ibqp;
+			event.event = IB_EVENT_QP_FATAL;
+			if (qp->ibqp.event_handler)
+				qp->ibqp.event_handler(&event,
+						       qp->ibqp.qp_context);
+		}
+	}
+
+	notify_eq(&dev->aeq);
+}
+
+int erdma_aeq_init(struct erdma_dev *dev)
+{
+	struct erdma_eq *eq = &dev->aeq;
+	u32 buf_size;
+
+	eq->depth = ERDMA_DEFAULT_EQ_DEPTH;
+	buf_size = eq->depth << EQE_SHIFT;
+
+	eq->qbuf =
+		dma_alloc_coherent(&dev->pdev->dev, WARPPED_BUFSIZE(buf_size),
+				   &eq->qbuf_dma_addr, GFP_KERNEL | __GFP_ZERO);
+	if (!eq->qbuf)
+		return -ENOMEM;
+
+	spin_lock_init(&eq->lock);
+	atomic64_set(&eq->event_num, 0);
+	atomic64_set(&eq->notify_num, 0);
+
+	eq->db_addr = (u64 __iomem *)(dev->func_bar + ERDMA_REGS_AEQ_DB_REG);
+	eq->db_record = (u64 *)(eq->qbuf + buf_size);
+
+	erdma_reg_write32(dev, ERDMA_REGS_AEQ_ADDR_H_REG,
+			  upper_32_bits(eq->qbuf_dma_addr));
+	erdma_reg_write32(dev, ERDMA_REGS_AEQ_ADDR_L_REG,
+			  lower_32_bits(eq->qbuf_dma_addr));
+	erdma_reg_write32(dev, ERDMA_REGS_AEQ_DEPTH_REG, eq->depth);
+	erdma_reg_write64(dev, ERDMA_AEQ_DB_HOST_ADDR_REG,
+			  eq->qbuf_dma_addr + buf_size);
+
+	return 0;
+}
+
+void erdma_aeq_destroy(struct erdma_dev *dev)
+{
+	struct erdma_eq *eq = &dev->aeq;
+
+	dma_free_coherent(&dev->pdev->dev,
+			  WARPPED_BUFSIZE(eq->depth << EQE_SHIFT), eq->qbuf,
+			  eq->qbuf_dma_addr);
+}
+
+void erdma_ceq_completion_handler(struct erdma_eq_cb *ceq_cb)
+{
+	struct erdma_dev *dev = ceq_cb->dev;
+	struct erdma_cq *cq;
+	u32 poll_cnt = 0;
+	u64 *ceqe;
+	int cqn;
+
+	if (!ceq_cb->ready)
+		return;
+
+	while (poll_cnt < MAX_POLL_CHUNK_SIZE) {
+		ceqe = get_next_valid_eqe(&ceq_cb->eq);
+		if (!ceqe)
+			break;
+
+		dma_rmb();
+		ceq_cb->eq.ci++;
+		poll_cnt++;
+		cqn = FIELD_GET(ERDMA_CEQE_HDR_CQN_MASK, READ_ONCE(*ceqe));
+
+		cq = find_cq_by_cqn(dev, cqn);
+		if (!cq)
+			continue;
+
+		if (rdma_is_kernel_res(&cq->ibcq.res))
+			cq->kern_cq.cmdsn++;
+
+		if (cq->ibcq.comp_handler)
+			cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context);
+	}
+
+	notify_eq(&ceq_cb->eq);
+}
+
+static irqreturn_t erdma_intr_ceq_handler(int irq, void *data)
+{
+	struct erdma_eq_cb *ceq_cb = data;
+
+	tasklet_schedule(&ceq_cb->tasklet);
+
+	return IRQ_HANDLED;
+}
+
+static void erdma_intr_ceq_task(unsigned long data)
+{
+	erdma_ceq_completion_handler((struct erdma_eq_cb *)data);
+}
+
+static int erdma_set_ceq_irq(struct erdma_dev *dev, u16 ceqn)
+{
+	struct erdma_eq_cb *eqc = &dev->ceqs[ceqn];
+	int err;
+
+	snprintf(eqc->irq.name, ERDMA_IRQNAME_SIZE, "erdma-ceq%u@pci:%s", ceqn,
+		 pci_name(dev->pdev));
+	eqc->irq.msix_vector = pci_irq_vector(dev->pdev, ceqn + 1);
+
+	tasklet_init(&dev->ceqs[ceqn].tasklet, erdma_intr_ceq_task,
+		     (unsigned long)&dev->ceqs[ceqn]);
+
+	cpumask_set_cpu(cpumask_local_spread(ceqn + 1, dev->attrs.numa_node),
+			&eqc->irq.affinity_hint_mask);
+
+	err = request_irq(eqc->irq.msix_vector, erdma_intr_ceq_handler, 0,
+			  eqc->irq.name, eqc);
+	if (err) {
+		dev_err(&dev->pdev->dev, "failed to request_irq(%d)\n", err);
+		return err;
+	}
+
+	irq_set_affinity_hint(eqc->irq.msix_vector,
+			      &eqc->irq.affinity_hint_mask);
+
+	return 0;
+}
+
+static void erdma_free_ceq_irq(struct erdma_dev *dev, u16 ceqn)
+{
+	struct erdma_eq_cb *eqc = &dev->ceqs[ceqn];
+
+	irq_set_affinity_hint(eqc->irq.msix_vector, NULL);
+	free_irq(eqc->irq.msix_vector, eqc);
+}
+
+static int create_eq_cmd(struct erdma_dev *dev, u32 eqn, struct erdma_eq *eq)
+{
+	struct erdma_cmdq_create_eq_req req;
+	dma_addr_t db_info_dma_addr;
+
+	erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_COMMON,
+				CMDQ_OPCODE_CREATE_EQ);
+	req.eqn = eqn;
+	req.depth = ilog2(eq->depth);
+	req.qbuf_addr = eq->qbuf_dma_addr;
+	req.qtype = ERDMA_EQ_TYPE_CEQ;
+	/* Vector index is the same as EQN. */
+	req.vector_idx = eqn;
+	db_info_dma_addr = eq->qbuf_dma_addr + (eq->depth << EQE_SHIFT);
+	req.db_dma_addr_l = lower_32_bits(db_info_dma_addr);
+	req.db_dma_addr_h = upper_32_bits(db_info_dma_addr);
+
+	return erdma_post_cmd_wait(&dev->cmdq, (u64 *)&req,
+				   sizeof(struct erdma_cmdq_create_eq_req),
+				   NULL, NULL);
+}
+
+static int erdma_ceq_init_one(struct erdma_dev *dev, u16 ceqn)
+{
+	struct erdma_eq *eq = &dev->ceqs[ceqn].eq;
+	u32 buf_size = ERDMA_DEFAULT_EQ_DEPTH << EQE_SHIFT;
+	int ret;
+
+	eq->qbuf =
+		dma_alloc_coherent(&dev->pdev->dev, WARPPED_BUFSIZE(buf_size),
+				   &eq->qbuf_dma_addr, GFP_KERNEL | __GFP_ZERO);
+	if (!eq->qbuf)
+		return -ENOMEM;
+
+	spin_lock_init(&eq->lock);
+	atomic64_set(&eq->event_num, 0);
+	atomic64_set(&eq->notify_num, 0);
+
+	eq->depth = ERDMA_DEFAULT_EQ_DEPTH;
+	eq->db_addr =
+		(u64 __iomem *)(dev->func_bar + ERDMA_REGS_CEQ_DB_BASE_REG +
+				(ceqn + 1) * ERDMA_DB_SIZE);
+	eq->db_record = (u64 *)(eq->qbuf + buf_size);
+	eq->ci = 0;
+	dev->ceqs[ceqn].dev = dev;
+
+	/* CEQ indexed from 1, 0 rsvd for CMDQ-EQ. */
+	ret = create_eq_cmd(dev, ceqn + 1, eq);
+	dev->ceqs[ceqn].ready = ret ? false : true;
+
+	return ret;
+}
+
+static void erdma_ceq_uninit_one(struct erdma_dev *dev, u16 ceqn)
+{
+	struct erdma_eq *eq = &dev->ceqs[ceqn].eq;
+	u32 buf_size = ERDMA_DEFAULT_EQ_DEPTH << EQE_SHIFT;
+	struct erdma_cmdq_destroy_eq_req req;
+	int err;
+
+	dev->ceqs[ceqn].ready = 0;
+
+	erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_COMMON,
+				CMDQ_OPCODE_DESTROY_EQ);
+	/* CEQ indexed from 1, 0 rsvd for CMDQ-EQ. */
+	req.eqn = ceqn + 1;
+	req.qtype = ERDMA_EQ_TYPE_CEQ;
+	req.vector_idx = ceqn + 1;
+
+	err = erdma_post_cmd_wait(&dev->cmdq, (u64 *)&req, sizeof(req), NULL,
+				  NULL);
+	if (err)
+		return;
+
+	dma_free_coherent(&dev->pdev->dev, WARPPED_BUFSIZE(buf_size), eq->qbuf,
+			  eq->qbuf_dma_addr);
+}
+
+int erdma_ceqs_init(struct erdma_dev *dev)
+{
+	u32 i, j;
+	int err;
+
+	for (i = 0; i < dev->attrs.irq_num - 1; i++) {
+		err = erdma_ceq_init_one(dev, i);
+		if (err)
+			goto out_err;
+
+		err = erdma_set_ceq_irq(dev, i);
+		if (err) {
+			erdma_ceq_uninit_one(dev, i);
+			goto out_err;
+		}
+	}
+
+	return 0;
+
+out_err:
+	for (j = 0; j < i; j++) {
+		erdma_free_ceq_irq(dev, j);
+		erdma_ceq_uninit_one(dev, j);
+	}
+
+	return err;
+}
+
+void erdma_ceqs_uninit(struct erdma_dev *dev)
+{
+	u32 i;
+
+	for (i = 0; i < dev->attrs.irq_num - 1; i++) {
+		erdma_free_ceq_irq(dev, i);
+		erdma_ceq_uninit_one(dev, i);
+	}
+}
diff --git a/drivers/infiniband/hw/erdma/erdma_hw.h b/drivers/infiniband/hw/erdma/erdma_hw.h
new file mode 100644
index 000000000000..b210c49c669f
--- /dev/null
+++ b/drivers/infiniband/hw/erdma/erdma_hw.h
@@ -0,0 +1,508 @@
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
+
+/* Authors: Cheng Xu <chengyou@linux.alibaba.com> */
+/*          Kai Shen <kaishen@linux.alibaba.com> */
+/* Copyright (c) 2020-2022, Alibaba Group. */
+
+#ifndef __ERDMA_HW_H__
+#define __ERDMA_HW_H__
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+
+/* PCIe device related definition. */
+#define PCI_VENDOR_ID_ALIBABA 0x1ded
+
+#define ERDMA_PCI_WIDTH 64
+#define ERDMA_FUNC_BAR 0
+#define ERDMA_MISX_BAR 2
+
+#define ERDMA_BAR_MASK (BIT(ERDMA_FUNC_BAR) | BIT(ERDMA_MISX_BAR))
+
+/* MSI-X related. */
+#define ERDMA_NUM_MSIX_VEC 32U
+#define ERDMA_MSIX_VECTOR_CMDQ 0
+
+/* PCIe Bar0 Registers. */
+#define ERDMA_REGS_VERSION_REG 0x0
+#define ERDMA_REGS_DEV_CTRL_REG 0x10
+#define ERDMA_REGS_DEV_ST_REG 0x14
+#define ERDMA_REGS_NETDEV_MAC_L_REG 0x18
+#define ERDMA_REGS_NETDEV_MAC_H_REG 0x1C
+#define ERDMA_REGS_CMDQ_SQ_ADDR_L_REG 0x20
+#define ERDMA_REGS_CMDQ_SQ_ADDR_H_REG 0x24
+#define ERDMA_REGS_CMDQ_CQ_ADDR_L_REG 0x28
+#define ERDMA_REGS_CMDQ_CQ_ADDR_H_REG 0x2C
+#define ERDMA_REGS_CMDQ_DEPTH_REG 0x30
+#define ERDMA_REGS_CMDQ_EQ_DEPTH_REG 0x34
+#define ERDMA_REGS_CMDQ_EQ_ADDR_L_REG 0x38
+#define ERDMA_REGS_CMDQ_EQ_ADDR_H_REG 0x3C
+#define ERDMA_REGS_AEQ_ADDR_L_REG 0x40
+#define ERDMA_REGS_AEQ_ADDR_H_REG 0x44
+#define ERDMA_REGS_AEQ_DEPTH_REG 0x48
+#define ERDMA_REGS_GRP_NUM_REG 0x4c
+#define ERDMA_REGS_AEQ_DB_REG 0x50
+#define ERDMA_CMDQ_SQ_DB_HOST_ADDR_REG 0x60
+#define ERDMA_CMDQ_CQ_DB_HOST_ADDR_REG 0x68
+#define ERDMA_CMDQ_EQ_DB_HOST_ADDR_REG 0x70
+#define ERDMA_AEQ_DB_HOST_ADDR_REG 0x78
+#define ERDMA_REGS_STATS_TSO_IN_PKTS_REG 0x80
+#define ERDMA_REGS_STATS_TSO_OUT_PKTS_REG 0x88
+#define ERDMA_REGS_STATS_TSO_OUT_BYTES_REG 0x90
+#define ERDMA_REGS_STATS_TX_DROP_PKTS_REG 0x98
+#define ERDMA_REGS_STATS_TX_BPS_METER_DROP_PKTS_REG 0xa0
+#define ERDMA_REGS_STATS_TX_PPS_METER_DROP_PKTS_REG 0xa8
+#define ERDMA_REGS_STATS_RX_PKTS_REG 0xc0
+#define ERDMA_REGS_STATS_RX_BYTES_REG 0xc8
+#define ERDMA_REGS_STATS_RX_DROP_PKTS_REG 0xd0
+#define ERDMA_REGS_STATS_RX_BPS_METER_DROP_PKTS_REG 0xd8
+#define ERDMA_REGS_STATS_RX_PPS_METER_DROP_PKTS_REG 0xe0
+#define ERDMA_REGS_CEQ_DB_BASE_REG 0x100
+#define ERDMA_CMDQ_SQDB_REG 0x200
+#define ERDMA_CMDQ_CQDB_REG 0x300
+
+/* DEV_CTRL_REG details. */
+#define ERDMA_REG_DEV_CTRL_RESET_MASK 0x00000001
+#define ERDMA_REG_DEV_CTRL_INIT_MASK 0x00000002
+
+/* DEV_ST_REG details. */
+#define ERDMA_REG_DEV_ST_RESET_DONE_MASK 0x00000001U
+#define ERDMA_REG_DEV_ST_INIT_DONE_MASK 0x00000002U
+
+/* eRDMA PCIe DBs definition. */
+#define ERDMA_BAR_DB_SPACE_BASE 4096
+
+#define ERDMA_BAR_SQDB_SPACE_OFFSET ERDMA_BAR_DB_SPACE_BASE
+#define ERDMA_BAR_SQDB_SPACE_SIZE (384 * 1024)
+
+#define ERDMA_BAR_RQDB_SPACE_OFFSET \
+	(ERDMA_BAR_SQDB_SPACE_OFFSET + ERDMA_BAR_SQDB_SPACE_SIZE)
+#define ERDMA_BAR_RQDB_SPACE_SIZE (96 * 1024)
+
+#define ERDMA_BAR_CQDB_SPACE_OFFSET \
+	(ERDMA_BAR_RQDB_SPACE_OFFSET + ERDMA_BAR_RQDB_SPACE_SIZE)
+
+/* Doorbell page resources related. */
+/*
+ * Max # of parallelly issued directSQE is 3072 per device,
+ * hardware organizes this into 24 group, per group has 128 credits.
+ */
+#define ERDMA_DWQE_MAX_GRP_CNT 24
+#define ERDMA_DWQE_NUM_PER_GRP 128
+
+#define ERDMA_DWQE_TYPE0_CNT 64
+#define ERDMA_DWQE_TYPE1_CNT 496
+/* type1 DB contains 2 DBs, takes 256Byte. */
+#define ERDMA_DWQE_TYPE1_CNT_PER_PAGE 16
+
+#define ERDMA_SDB_SHARED_PAGE_INDEX 95
+
+/* Doorbell related. */
+#define ERDMA_DB_SIZE 8
+
+#define ERDMA_CQDB_IDX_MASK GENMASK_ULL(63, 56)
+#define ERDMA_CQDB_CQN_MASK GENMASK_ULL(55, 32)
+#define ERDMA_CQDB_ARM_MASK BIT_ULL(31)
+#define ERDMA_CQDB_SOL_MASK BIT_ULL(30)
+#define ERDMA_CQDB_CMDSN_MASK GENMASK_ULL(29, 28)
+#define ERDMA_CQDB_CI_MASK GENMASK_ULL(23, 0)
+
+#define ERDMA_EQDB_ARM_MASK BIT(31)
+#define ERDMA_EQDB_CI_MASK GENMASK_ULL(23, 0)
+
+#define ERDMA_PAGE_SIZE_SUPPORT 0x7FFFF000
+
+/* WQE related. */
+#define EQE_SIZE 16
+#define EQE_SHIFT 4
+#define RQE_SIZE 32
+#define RQE_SHIFT 5
+#define CQE_SIZE 32
+#define CQE_SHIFT 5
+#define SQEBB_SIZE 32
+#define SQEBB_SHIFT 5
+#define SQEBB_MASK (~(SQEBB_SIZE - 1))
+#define SQEBB_ALIGN(size) ((size + SQEBB_SIZE - 1) & SQEBB_MASK)
+#define SQEBB_COUNT(size) (SQEBB_ALIGN(size) >> SQEBB_SHIFT)
+
+#define ERDMA_MAX_SQE_SIZE 128
+#define ERDMA_MAX_WQEBB_PER_SQE 4
+
+/* CMDQ related. */
+#define ERDMA_CMDQ_MAX_OUTSTANDING 128
+#define ERDMA_CMDQ_SQE_SIZE 64
+
+/* cmdq sub module definition. */
+enum CMDQ_WQE_SUB_MOD {
+	CMDQ_SUBMOD_RDMA = 0,
+	CMDQ_SUBMOD_COMMON = 1
+};
+
+enum CMDQ_RDMA_OPCODE {
+	CMDQ_OPCODE_QUERY_DEVICE = 0,
+	CMDQ_OPCODE_CREATE_QP = 1,
+	CMDQ_OPCODE_DESTROY_QP = 2,
+	CMDQ_OPCODE_MODIFY_QP = 3,
+	CMDQ_OPCODE_CREATE_CQ = 4,
+	CMDQ_OPCODE_DESTROY_CQ = 5,
+	CMDQ_OPCODE_REG_MR = 8,
+	CMDQ_OPCODE_DEREG_MR = 9
+};
+
+enum CMDQ_COMMON_OPCODE {
+	CMDQ_OPCODE_CREATE_EQ = 0,
+	CMDQ_OPCODE_DESTROY_EQ = 1,
+	CMDQ_OPCODE_QUERY_FW_INFO = 2,
+};
+
+/* cmdq-SQE HDR */
+#define ERDMA_CMD_HDR_WQEBB_CNT_MASK GENMASK_ULL(54, 52)
+#define ERDMA_CMD_HDR_CONTEXT_COOKIE_MASK GENMASK_ULL(47, 32)
+#define ERDMA_CMD_HDR_SUB_MOD_MASK GENMASK_ULL(25, 24)
+#define ERDMA_CMD_HDR_OPCODE_MASK GENMASK_ULL(23, 16)
+#define ERDMA_CMD_HDR_WQEBB_INDEX_MASK GENMASK_ULL(15, 0)
+
+struct erdma_cmdq_destroy_cq_req {
+	u64 hdr;
+	u32 cqn;
+};
+
+#define ERDMA_EQ_TYPE_AEQ 0
+#define ERDMA_EQ_TYPE_CEQ 1
+
+struct erdma_cmdq_create_eq_req {
+	u64 hdr;
+	u64 qbuf_addr;
+	u8 vector_idx;
+	u8 eqn;
+	u8 depth;
+	u8 qtype;
+	u32 db_dma_addr_l;
+	u32 db_dma_addr_h;
+};
+
+struct erdma_cmdq_destroy_eq_req {
+	u64 hdr;
+	u64 rsvd0;
+	u8 vector_idx;
+	u8 eqn;
+	u8 rsvd1;
+	u8 qtype;
+};
+
+/* create_cq cfg0 */
+#define ERDMA_CMD_CREATE_CQ_DEPTH_MASK GENMASK(31, 24)
+#define ERDMA_CMD_CREATE_CQ_PAGESIZE_MASK GENMASK(23, 20)
+#define ERDMA_CMD_CREATE_CQ_CQN_MASK GENMASK(19, 0)
+
+/* create_cq cfg1 */
+#define ERDMA_CMD_CREATE_CQ_MTT_CNT_MASK GENMASK(31, 16)
+#define ERDMA_CMD_CREATE_CQ_MTT_TYPE_MASK BIT(15)
+#define ERDMA_CMD_CREATE_CQ_EQN_MASK GENMASK(9, 0)
+
+struct erdma_cmdq_create_cq_req {
+	u64 hdr;
+	u32 cfg0;
+	u32 qbuf_addr_l;
+	u32 qbuf_addr_h;
+	u32 cfg1;
+	u64 cq_db_info_addr;
+	u32 first_page_offset;
+};
+
+/* regmr/deregmr cfg0 */
+#define ERDMA_CMD_MR_VALID_MASK BIT(31)
+#define ERDMA_CMD_MR_KEY_MASK GENMASK(27, 20)
+#define ERDMA_CMD_MR_MPT_IDX_MASK GENMASK(19, 0)
+
+/* regmr cfg1 */
+#define ERDMA_CMD_REGMR_PD_MASK GENMASK(31, 12)
+#define ERDMA_CMD_REGMR_TYPE_MASK GENMASK(7, 6)
+#define ERDMA_CMD_REGMR_RIGHT_MASK GENMASK(5, 2)
+#define ERDMA_CMD_REGMR_ACC_MODE_MASK GENMASK(1, 0)
+
+/* regmr cfg2 */
+#define ERDMA_CMD_REGMR_PAGESIZE_MASK GENMASK(31, 27)
+#define ERDMA_CMD_REGMR_MTT_TYPE_MASK GENMASK(21, 20)
+#define ERDMA_CMD_REGMR_MTT_CNT_MASK GENMASK(19, 0)
+
+struct erdma_cmdq_reg_mr_req {
+	u64 hdr;
+	u32 cfg0;
+	u32 cfg1;
+	u64 start_va;
+	u32 size;
+	u32 cfg2;
+	u64 phy_addr[4];
+};
+
+struct erdma_cmdq_dereg_mr_req {
+	u64 hdr;
+	u32 cfg;
+};
+
+/* modify qp cfg */
+#define ERDMA_CMD_MODIFY_QP_STATE_MASK GENMASK(31, 24)
+#define ERDMA_CMD_MODIFY_QP_CC_MASK GENMASK(23, 20)
+#define ERDMA_CMD_MODIFY_QP_QPN_MASK GENMASK(19, 0)
+
+struct erdma_cmdq_modify_qp_req {
+	u64 hdr;
+	u32 cfg;
+	u32 cookie;
+	__be32 dip;
+	__be32 sip;
+	__be16 sport;
+	__be16 dport;
+	u32 send_nxt;
+	u32 recv_nxt;
+};
+
+/* create qp cfg0 */
+#define ERDMA_CMD_CREATE_QP_SQ_DEPTH_MASK GENMASK(31, 20)
+#define ERDMA_CMD_CREATE_QP_QPN_MASK GENMASK(19, 0)
+
+/* create qp cfg1 */
+#define ERDMA_CMD_CREATE_QP_RQ_DEPTH_MASK GENMASK(31, 20)
+#define ERDMA_CMD_CREATE_QP_PD_MASK GENMASK(19, 0)
+
+/* create qp cqn_mtt_cfg */
+#define ERDMA_CMD_CREATE_QP_PAGE_SIZE_MASK GENMASK(31, 28)
+#define ERDMA_CMD_CREATE_QP_CQN_MASK GENMASK(23, 0)
+
+/* create qp mtt_cfg */
+#define ERDMA_CMD_CREATE_QP_PAGE_OFFSET_MASK GENMASK(31, 12)
+#define ERDMA_CMD_CREATE_QP_MTT_CNT_MASK GENMASK(11, 1)
+#define ERDMA_CMD_CREATE_QP_MTT_TYPE_MASK BIT(0)
+
+#define ERDMA_CMDQ_CREATE_QP_RESP_COOKIE_MASK GENMASK_ULL(31, 0)
+
+struct erdma_cmdq_create_qp_req {
+	u64 hdr;
+	u32 cfg0;
+	u32 cfg1;
+	u32 sq_cqn_mtt_cfg;
+	u32 rq_cqn_mtt_cfg;
+	u64 sq_buf_addr;
+	u64 rq_buf_addr;
+	u32 sq_mtt_cfg;
+	u32 rq_mtt_cfg;
+	u64 sq_db_info_dma_addr;
+	u64 rq_db_info_dma_addr;
+};
+
+struct erdma_cmdq_destroy_qp_req {
+	u64 hdr;
+	u32 qpn;
+};
+
+/* cap qword 0 definition */
+#define ERDMA_CMD_DEV_CAP_MAX_CQE_MASK GENMASK_ULL(47, 40)
+#define ERDMA_CMD_DEV_CAP_MAX_RECV_WR_MASK GENMASK_ULL(23, 16)
+#define ERDMA_CMD_DEV_CAP_MAX_MR_SIZE_MASK GENMASK_ULL(7, 0)
+
+/* cap qword 1 definition */
+#define ERDMA_CMD_DEV_CAP_DMA_LOCAL_KEY_MASK GENMASK_ULL(63, 32)
+#define ERDMA_CMD_DEV_CAP_DEFAULT_CC_MASK GENMASK_ULL(31, 28)
+#define ERDMA_CMD_DEV_CAP_QBLOCK_MASK GENMASK_ULL(27, 16)
+#define ERDMA_CMD_DEV_CAP_MAX_MW_MASK GENMASK_ULL(7, 0)
+
+#define ERDMA_NQP_PER_QBLOCK 1024
+
+#define ERDMA_CMD_INFO0_FW_VER_MASK GENMASK_ULL(31, 0)
+
+/* CQE hdr */
+#define ERDMA_CQE_HDR_OWNER_MASK BIT(31)
+#define ERDMA_CQE_HDR_OPCODE_MASK GENMASK(23, 16)
+#define ERDMA_CQE_HDR_QTYPE_MASK GENMASK(15, 8)
+#define ERDMA_CQE_HDR_SYNDROME_MASK GENMASK(7, 0)
+
+#define ERDMA_CQE_QTYPE_SQ 0
+#define ERDMA_CQE_QTYPE_RQ 1
+#define ERDMA_CQE_QTYPE_CMDQ 2
+
+struct erdma_cqe {
+	__be32 hdr;
+	__be32 qe_idx;
+	__be32 qpn;
+	union {
+		__le32 imm_data;
+		__be32 inv_rkey;
+	};
+	__be32 size;
+	__be32 rsvd[3];
+};
+
+struct erdma_sge {
+	__aligned_le64 laddr;
+	__le32 length;
+	__le32 lkey;
+};
+
+/* Receive Queue Element */
+struct erdma_rqe {
+	__le16 qe_idx;
+	__le16 rsvd0;
+	__le32 qpn;
+	__le32 rsvd1;
+	__le32 rsvd2;
+	__le64 to;
+	__le32 length;
+	__le32 stag;
+};
+
+/* SQE */
+#define ERDMA_SQE_HDR_SGL_LEN_MASK GENMASK_ULL(63, 56)
+#define ERDMA_SQE_HDR_WQEBB_CNT_MASK GENMASK_ULL(54, 52)
+#define ERDMA_SQE_HDR_QPN_MASK GENMASK_ULL(51, 32)
+#define ERDMA_SQE_HDR_OPCODE_MASK GENMASK_ULL(31, 27)
+#define ERDMA_SQE_HDR_DWQE_MASK BIT_ULL(26)
+#define ERDMA_SQE_HDR_INLINE_MASK BIT_ULL(25)
+#define ERDMA_SQE_HDR_FENCE_MASK BIT_ULL(24)
+#define ERDMA_SQE_HDR_SE_MASK BIT_ULL(23)
+#define ERDMA_SQE_HDR_CE_MASK BIT_ULL(22)
+#define ERDMA_SQE_HDR_WQEBB_INDEX_MASK GENMASK_ULL(15, 0)
+
+/* REG MR attrs */
+#define ERDMA_SQE_MR_MODE_MASK GENMASK(1, 0)
+#define ERDMA_SQE_MR_ACCESS_MASK GENMASK(5, 2)
+#define ERDMA_SQE_MR_MTT_TYPE_MASK GENMASK(7, 6)
+#define ERDMA_SQE_MR_MTT_CNT_MASK GENMASK(31, 12)
+
+struct erdma_write_sqe {
+	__le64 hdr;
+	__be32 imm_data;
+	__le32 length;
+
+	__le32 sink_stag;
+	__le32 sink_to_l;
+	__le32 sink_to_h;
+
+	__le32 rsvd;
+
+	struct erdma_sge sgl[0];
+};
+
+struct erdma_send_sqe {
+	__le64 hdr;
+	union {
+		__be32 imm_data;
+		__le32 invalid_stag;
+	};
+
+	__le32 length;
+	struct erdma_sge sgl[0];
+};
+
+struct erdma_readreq_sqe {
+	__le64 hdr;
+	__le32 invalid_stag;
+	__le32 length;
+	__le32 sink_stag;
+	__le32 sink_to_l;
+	__le32 sink_to_h;
+	__le32 rsvd;
+};
+
+struct erdma_reg_mr_sqe {
+	__le64 hdr;
+	__le64 addr;
+	__le32 length;
+	__le32 stag;
+	__le32 attrs;
+	__le32 rsvd;
+};
+
+/* EQ related. */
+#define ERDMA_DEFAULT_EQ_DEPTH 256
+
+/* ceqe */
+#define ERDMA_CEQE_HDR_DB_MASK BIT_ULL(63)
+#define ERDMA_CEQE_HDR_PI_MASK GENMASK_ULL(55, 32)
+#define ERDMA_CEQE_HDR_O_MASK BIT_ULL(31)
+#define ERDMA_CEQE_HDR_CQN_MASK GENMASK_ULL(19, 0)
+
+/* aeqe */
+#define ERDMA_AEQE_HDR_O_MASK BIT(31)
+#define ERDMA_AEQE_HDR_TYPE_MASK GENMASK(23, 16)
+#define ERDMA_AEQE_HDR_SUBTYPE_MASK GENMASK(7, 0)
+
+#define ERDMA_AE_TYPE_QP_FATAL_EVENT 0
+#define ERDMA_AE_TYPE_QP_ERQ_ERR_EVENT 1
+#define ERDMA_AE_TYPE_ACC_ERR_EVENT 2
+#define ERDMA_AE_TYPE_CQ_ERR 3
+#define ERDMA_AE_TYPE_OTHER_ERROR 4
+
+struct erdma_aeqe {
+	__le32 hdr;
+	__le32 event_data0;
+	__le32 event_data1;
+	__le32 rsvd;
+};
+
+enum erdma_opcode {
+	ERDMA_OP_WRITE = 0,
+	ERDMA_OP_READ = 1,
+	ERDMA_OP_SEND = 2,
+	ERDMA_OP_SEND_WITH_IMM = 3,
+
+	ERDMA_OP_RECEIVE = 4,
+	ERDMA_OP_RECV_IMM = 5,
+	ERDMA_OP_RECV_INV = 6,
+
+	ERDMA_OP_REQ_ERR = 7,
+	ERDMA_OP_READ_RESPONSE = 8,
+	ERDMA_OP_WRITE_WITH_IMM = 9,
+
+	ERDMA_OP_RECV_ERR = 10,
+
+	ERDMA_OP_INVALIDATE = 11,
+	ERDMA_OP_RSP_SEND_IMM = 12,
+	ERDMA_OP_SEND_WITH_INV = 13,
+
+	ERDMA_OP_REG_MR = 14,
+	ERDMA_OP_LOCAL_INV = 15,
+	ERDMA_OP_READ_WITH_INV = 16,
+	ERDMA_NUM_OPCODES = 17,
+	ERDMA_OP_INVALID = ERDMA_NUM_OPCODES + 1
+};
+
+enum erdma_wc_status {
+	ERDMA_WC_SUCCESS = 0,
+	ERDMA_WC_GENERAL_ERR = 1,
+	ERDMA_WC_RECV_WQE_FORMAT_ERR = 2,
+	ERDMA_WC_RECV_STAG_INVALID_ERR = 3,
+	ERDMA_WC_RECV_ADDR_VIOLATION_ERR = 4,
+	ERDMA_WC_RECV_RIGHT_VIOLATION_ERR = 5,
+	ERDMA_WC_RECV_PDID_ERR = 6,
+	ERDMA_WC_RECV_WARRPING_ERR = 7,
+	ERDMA_WC_SEND_WQE_FORMAT_ERR = 8,
+	ERDMA_WC_SEND_WQE_ORD_EXCEED = 9,
+	ERDMA_WC_SEND_STAG_INVALID_ERR = 10,
+	ERDMA_WC_SEND_ADDR_VIOLATION_ERR = 11,
+	ERDMA_WC_SEND_RIGHT_VIOLATION_ERR = 12,
+	ERDMA_WC_SEND_PDID_ERR = 13,
+	ERDMA_WC_SEND_WARRPING_ERR = 14,
+	ERDMA_WC_FLUSH_ERR = 15,
+	ERDMA_WC_RETRY_EXC_ERR = 16,
+	ERDMA_NUM_WC_STATUS
+};
+
+enum erdma_vendor_err {
+	ERDMA_WC_VENDOR_NO_ERR = 0,
+	ERDMA_WC_VENDOR_INVALID_RQE = 1,
+	ERDMA_WC_VENDOR_RQE_INVALID_STAG = 2,
+	ERDMA_WC_VENDOR_RQE_ADDR_VIOLATION = 3,
+	ERDMA_WC_VENDOR_RQE_ACCESS_RIGHT_ERR = 4,
+	ERDMA_WC_VENDOR_RQE_INVALID_PD = 5,
+	ERDMA_WC_VENDOR_RQE_WRAP_ERR = 6,
+	ERDMA_WC_VENDOR_INVALID_SQE = 0x20,
+	ERDMA_WC_VENDOR_ZERO_ORD = 0x21,
+	ERDMA_WC_VENDOR_SQE_INVALID_STAG = 0x30,
+	ERDMA_WC_VENDOR_SQE_ADDR_VIOLATION = 0x31,
+	ERDMA_WC_VENDOR_SQE_ACCESS_ERR = 0x32,
+	ERDMA_WC_VENDOR_SQE_INVALID_PD = 0x33,
+	ERDMA_WC_VENDOR_SQE_WARP_ERR = 0x34
+};
+
+#endif
diff --git a/drivers/infiniband/hw/erdma/erdma_main.c b/drivers/infiniband/hw/erdma/erdma_main.c
new file mode 100644
index 000000000000..07e743d24847
--- /dev/null
+++ b/drivers/infiniband/hw/erdma/erdma_main.c
@@ -0,0 +1,608 @@
+// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
+
+/* Authors: Cheng Xu <chengyou@linux.alibaba.com> */
+/*          Kai Shen <kaishen@linux.alibaba.com> */
+/* Copyright (c) 2020-2022, Alibaba Group. */
+
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/pci.h>
+#include <net/addrconf.h>
+#include <rdma/erdma-abi.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_user_verbs.h>
+
+#include "erdma.h"
+#include "erdma_cm.h"
+#include "erdma_hw.h"
+#include "erdma_verbs.h"
+
+MODULE_AUTHOR("Cheng Xu <chengyou@linux.alibaba.com>");
+MODULE_DESCRIPTION("Alibaba elasticRDMA adapter driver");
+MODULE_LICENSE("Dual BSD/GPL");
+
+static int erdma_netdev_event(struct notifier_block *nb, unsigned long event,
+			      void *arg)
+{
+	struct net_device *netdev = netdev_notifier_info_to_dev(arg);
+	struct erdma_dev *dev = container_of(nb, struct erdma_dev, netdev_nb);
+
+	if (dev->netdev == NULL || dev->netdev != netdev)
+		goto done;
+
+	switch (event) {
+	case NETDEV_UP:
+		dev->state = IB_PORT_ACTIVE;
+		erdma_port_event(dev, IB_EVENT_PORT_ACTIVE);
+		break;
+	case NETDEV_DOWN:
+		dev->state = IB_PORT_DOWN;
+		erdma_port_event(dev, IB_EVENT_PORT_ERR);
+		break;
+	case NETDEV_REGISTER:
+	case NETDEV_UNREGISTER:
+	case NETDEV_CHANGEADDR:
+	case NETDEV_CHANGEMTU:
+	case NETDEV_GOING_DOWN:
+	case NETDEV_CHANGE:
+	default:
+		break;
+	}
+
+done:
+	return NOTIFY_OK;
+}
+
+static int erdma_enum_and_get_netdev(struct erdma_dev *dev)
+{
+	struct net_device *netdev;
+	int ret = -ENODEV;
+
+	/* Already binded to a net_device, so we skip. */
+	if (dev->netdev)
+		return 0;
+
+	rtnl_lock();
+	for_each_netdev(&init_net, netdev) {
+		/*
+		 * In erdma, the paired netdev and ibdev should have the same
+		 * MAC address. erdma can get the value from its PCIe bar
+		 * registers. Since erdma can not get the paired netdev
+		 * reference directly, we do a traverse here to get the paired
+		 * netdev.
+		 */
+		if (ether_addr_equal_unaligned(netdev->perm_addr,
+					       dev->attrs.peer_addr)) {
+			ret = ib_device_set_netdev(&dev->ibdev, netdev, 1);
+			if (ret) {
+				rtnl_unlock();
+				ibdev_warn(&dev->ibdev,
+					   "failed (%d) to link netdev", ret);
+				return ret;
+			}
+
+			dev->netdev = netdev;
+			break;
+		}
+	}
+
+	rtnl_unlock();
+
+	return ret;
+}
+
+static int erdma_device_register(struct erdma_dev *dev)
+{
+	struct ib_device *ibdev = &dev->ibdev;
+	int ret;
+
+	ret = erdma_enum_and_get_netdev(dev);
+	if (ret)
+		return ret;
+
+	addrconf_addr_eui48((u8 *)&ibdev->node_guid, dev->netdev->dev_addr);
+
+	ret = ib_register_device(ibdev, "erdma_%d", &dev->pdev->dev);
+	if (ret) {
+		dev_err(&dev->pdev->dev,
+			"ib_register_device failed: ret = %d\n", ret);
+		return ret;
+	}
+
+	dev->netdev_nb.notifier_call = erdma_netdev_event;
+	ret = register_netdevice_notifier(&dev->netdev_nb);
+	if (ret) {
+		ibdev_err(&dev->ibdev, "failed to register notifier.\n");
+		ib_unregister_device(ibdev);
+	}
+
+	return ret;
+}
+
+static irqreturn_t erdma_comm_irq_handler(int irq, void *data)
+{
+	struct erdma_dev *dev = data;
+
+	erdma_cmdq_completion_handler(&dev->cmdq);
+	erdma_aeq_event_handler(dev);
+
+	return IRQ_HANDLED;
+}
+
+static void erdma_dwqe_resource_init(struct erdma_dev *dev)
+{
+	int total_pages, type0, type1;
+
+	dev->attrs.grp_num = erdma_reg_read32(dev, ERDMA_REGS_GRP_NUM_REG);
+
+	if (dev->attrs.grp_num < 4)
+		dev->attrs.disable_dwqe = true;
+	else
+		dev->attrs.disable_dwqe = false;
+
+	/* One page contains 4 goups. */
+	total_pages = dev->attrs.grp_num * 4;
+
+	if (dev->attrs.grp_num >= ERDMA_DWQE_MAX_GRP_CNT) {
+		dev->attrs.grp_num = ERDMA_DWQE_MAX_GRP_CNT;
+		type0 = ERDMA_DWQE_TYPE0_CNT;
+		type1 = ERDMA_DWQE_TYPE1_CNT / ERDMA_DWQE_TYPE1_CNT_PER_PAGE;
+	} else {
+		type1 = total_pages / 3;
+		type0 = total_pages - type1 - 1;
+	}
+
+	dev->attrs.dwqe_pages = type0;
+	dev->attrs.dwqe_entries = type1 * ERDMA_DWQE_TYPE1_CNT_PER_PAGE;
+}
+
+static int erdma_request_vectors(struct erdma_dev *dev)
+{
+	int expect_irq_num = min(num_possible_cpus() + 1, ERDMA_NUM_MSIX_VEC);
+	int ret;
+
+	ret = pci_alloc_irq_vectors(dev->pdev, 1, expect_irq_num, PCI_IRQ_MSIX);
+	if (ret < 0) {
+		dev_err(&dev->pdev->dev, "request irq vectors failed(%d)\n",
+			ret);
+		return ret;
+	}
+	dev->attrs.irq_num = ret;
+
+	return 0;
+}
+
+static int erdma_comm_irq_init(struct erdma_dev *dev)
+{
+	snprintf(dev->comm_irq.name, ERDMA_IRQNAME_SIZE, "erdma-common@pci:%s",
+		 pci_name(dev->pdev));
+	dev->comm_irq.msix_vector =
+		pci_irq_vector(dev->pdev, ERDMA_MSIX_VECTOR_CMDQ);
+
+	cpumask_set_cpu(cpumask_first(cpumask_of_pcibus(dev->pdev->bus)),
+			&dev->comm_irq.affinity_hint_mask);
+	irq_set_affinity_hint(dev->comm_irq.msix_vector,
+			      &dev->comm_irq.affinity_hint_mask);
+
+	return request_irq(dev->comm_irq.msix_vector, erdma_comm_irq_handler, 0,
+			   dev->comm_irq.name, dev);
+}
+
+static void erdma_comm_irq_uninit(struct erdma_dev *dev)
+{
+	irq_set_affinity_hint(dev->comm_irq.msix_vector, NULL);
+	free_irq(dev->comm_irq.msix_vector, dev);
+}
+
+static int erdma_device_init(struct erdma_dev *dev, struct pci_dev *pdev)
+{
+	int ret;
+
+	erdma_dwqe_resource_init(dev);
+
+	ret = dma_set_mask_and_coherent(&pdev->dev,
+					DMA_BIT_MASK(ERDMA_PCI_WIDTH));
+	if (ret)
+		return ret;
+
+	dma_set_max_seg_size(&pdev->dev, UINT_MAX);
+
+	return 0;
+}
+
+static void erdma_device_uninit(struct erdma_dev *dev)
+{
+	u32 ctrl = FIELD_PREP(ERDMA_REG_DEV_CTRL_RESET_MASK, 1);
+
+	erdma_reg_write32(dev, ERDMA_REGS_DEV_CTRL_REG, ctrl);
+}
+
+static const struct pci_device_id erdma_pci_tbl[] = {
+	{ PCI_DEVICE(PCI_VENDOR_ID_ALIBABA, 0x107f) },
+	{}
+};
+
+static int erdma_probe_dev(struct pci_dev *pdev)
+{
+	struct erdma_dev *dev;
+	int bars, err;
+	u32 version;
+
+	err = pci_enable_device(pdev);
+	if (err) {
+		dev_err(&pdev->dev, "pci_enable_device failed(%d)\n", err);
+		return err;
+	}
+
+	pci_set_master(pdev);
+
+	dev = ib_alloc_device(erdma_dev, ibdev);
+	if (!dev) {
+		dev_err(&pdev->dev, "ib_alloc_device failed\n");
+		err = -ENOMEM;
+		goto err_disable_device;
+	}
+
+	pci_set_drvdata(pdev, dev);
+	dev->pdev = pdev;
+	dev->attrs.numa_node = dev_to_node(&pdev->dev);
+
+	bars = pci_select_bars(pdev, IORESOURCE_MEM);
+	err = pci_request_selected_regions(pdev, bars, DRV_MODULE_NAME);
+	if (bars != ERDMA_BAR_MASK || err) {
+		err = err ? err : -EINVAL;
+		goto err_ib_device_release;
+	}
+
+	dev->func_bar_addr = pci_resource_start(pdev, ERDMA_FUNC_BAR);
+	dev->func_bar_len = pci_resource_len(pdev, ERDMA_FUNC_BAR);
+
+	dev->func_bar =
+		devm_ioremap(&pdev->dev, dev->func_bar_addr, dev->func_bar_len);
+	if (!dev->func_bar) {
+		dev_err(&pdev->dev, "devm_ioremap failed.\n");
+		err = -EFAULT;
+		goto err_release_bars;
+	}
+
+	version = erdma_reg_read32(dev, ERDMA_REGS_VERSION_REG);
+	if (version == 0) {
+		/* we knows that it is a non-functional function. */
+		err = -ENODEV;
+		goto err_iounmap_func_bar;
+	}
+
+	err = erdma_device_init(dev, pdev);
+	if (err)
+		goto err_iounmap_func_bar;
+
+	err = erdma_request_vectors(dev);
+	if (err)
+		goto err_iounmap_func_bar;
+
+	err = erdma_comm_irq_init(dev);
+	if (err)
+		goto err_free_vectors;
+
+	err = erdma_aeq_init(dev);
+	if (err)
+		goto err_uninit_comm_irq;
+
+	err = erdma_cmdq_init(dev);
+	if (err)
+		goto err_uninit_aeq;
+
+	err = erdma_ceqs_init(dev);
+	if (err)
+		goto err_uninit_cmdq;
+
+	erdma_finish_cmdq_init(dev);
+
+	return 0;
+
+err_uninit_cmdq:
+	erdma_device_uninit(dev);
+	erdma_cmdq_destroy(dev);
+
+err_uninit_aeq:
+	erdma_aeq_destroy(dev);
+
+err_uninit_comm_irq:
+	erdma_comm_irq_uninit(dev);
+
+err_free_vectors:
+	pci_free_irq_vectors(dev->pdev);
+
+err_iounmap_func_bar:
+	devm_iounmap(&pdev->dev, dev->func_bar);
+
+err_release_bars:
+	pci_release_selected_regions(pdev, bars);
+
+err_ib_device_release:
+	ib_dealloc_device(&dev->ibdev);
+
+err_disable_device:
+	pci_disable_device(pdev);
+
+	return err;
+}
+
+static void erdma_remove_dev(struct pci_dev *pdev)
+{
+	struct erdma_dev *dev = pci_get_drvdata(pdev);
+
+	erdma_ceqs_uninit(dev);
+
+	erdma_device_uninit(dev);
+
+	erdma_cmdq_destroy(dev);
+	erdma_aeq_destroy(dev);
+	erdma_comm_irq_uninit(dev);
+	pci_free_irq_vectors(dev->pdev);
+
+	devm_iounmap(&pdev->dev, dev->func_bar);
+	pci_release_selected_regions(pdev, ERDMA_BAR_MASK);
+
+	ib_dealloc_device(&dev->ibdev);
+
+	pci_disable_device(pdev);
+}
+
+#define ERDMA_GET_CAP(name, cap) FIELD_GET(ERDMA_CMD_DEV_CAP_##name##_MASK, cap)
+
+static int erdma_dev_attrs_init(struct erdma_dev *dev)
+{
+	int err;
+	u64 req_hdr, cap0, cap1;
+
+	erdma_cmdq_build_reqhdr(&req_hdr, CMDQ_SUBMOD_RDMA,
+				CMDQ_OPCODE_QUERY_DEVICE);
+
+	err = erdma_post_cmd_wait(&dev->cmdq, &req_hdr, sizeof(req_hdr), &cap0,
+				  &cap1);
+	if (err)
+		return err;
+
+	dev->attrs.max_cqe = 1 << ERDMA_GET_CAP(MAX_CQE, cap0);
+	dev->attrs.max_mr_size = 1ULL << ERDMA_GET_CAP(MAX_MR_SIZE, cap0);
+	dev->attrs.max_mw = 1 << ERDMA_GET_CAP(MAX_MW, cap1);
+	dev->attrs.max_recv_wr = 1 << ERDMA_GET_CAP(MAX_RECV_WR, cap0);
+	dev->attrs.local_dma_key = ERDMA_GET_CAP(DMA_LOCAL_KEY, cap1);
+	dev->attrs.cc = ERDMA_GET_CAP(DEFAULT_CC, cap1);
+	dev->attrs.max_qp = ERDMA_NQP_PER_QBLOCK * ERDMA_GET_CAP(QBLOCK, cap1);
+	dev->attrs.max_mr = dev->attrs.max_qp << 1;
+	dev->attrs.max_cq = dev->attrs.max_qp << 1;
+
+	dev->attrs.max_send_wr = ERDMA_MAX_SEND_WR;
+	dev->attrs.max_ord = ERDMA_MAX_ORD;
+	dev->attrs.max_ird = ERDMA_MAX_IRD;
+	dev->attrs.max_send_sge = ERDMA_MAX_SEND_SGE;
+	dev->attrs.max_recv_sge = ERDMA_MAX_RECV_SGE;
+	dev->attrs.max_sge_rd = ERDMA_MAX_SGE_RD;
+	dev->attrs.max_pd = ERDMA_MAX_PD;
+
+	dev->res_cb[ERDMA_RES_TYPE_PD].max_cap = ERDMA_MAX_PD;
+	dev->res_cb[ERDMA_RES_TYPE_STAG_IDX].max_cap = dev->attrs.max_mr;
+
+	erdma_cmdq_build_reqhdr(&req_hdr, CMDQ_SUBMOD_COMMON,
+				CMDQ_OPCODE_QUERY_FW_INFO);
+
+	err = erdma_post_cmd_wait(&dev->cmdq, &req_hdr, sizeof(req_hdr), &cap0,
+				  &cap1);
+	if (!err)
+		dev->attrs.fw_version =
+			FIELD_GET(ERDMA_CMD_INFO0_FW_VER_MASK, cap0);
+
+	return err;
+}
+
+static int erdma_res_cb_init(struct erdma_dev *dev)
+{
+	int i, j;
+
+	for (i = 0; i < ERDMA_RES_CNT; i++) {
+		dev->res_cb[i].next_alloc_idx = 1;
+		spin_lock_init(&dev->res_cb[i].lock);
+		dev->res_cb[i].bitmap =
+			bitmap_zalloc(dev->res_cb[i].max_cap, GFP_KERNEL);
+		if (!dev->res_cb[i].bitmap)
+			goto err;
+	}
+
+	return 0;
+
+err:
+	for (j = 0; j < i; j++)
+		bitmap_free(dev->res_cb[j].bitmap);
+
+	return -ENOMEM;
+}
+
+static void erdma_res_cb_free(struct erdma_dev *dev)
+{
+	int i;
+
+	for (i = 0; i < ERDMA_RES_CNT; i++)
+		bitmap_free(dev->res_cb[i].bitmap);
+}
+
+static const struct ib_device_ops erdma_device_ops = {
+	.owner = THIS_MODULE,
+	.driver_id = RDMA_DRIVER_ERDMA,
+	.uverbs_abi_ver = ERDMA_ABI_VERSION,
+
+	.alloc_mr = erdma_ib_alloc_mr,
+	.alloc_pd = erdma_alloc_pd,
+	.alloc_ucontext = erdma_alloc_ucontext,
+	.create_cq = erdma_create_cq,
+	.create_qp = erdma_create_qp,
+	.dealloc_pd = erdma_dealloc_pd,
+	.dealloc_ucontext = erdma_dealloc_ucontext,
+	.dereg_mr = erdma_dereg_mr,
+	.destroy_cq = erdma_destroy_cq,
+	.destroy_qp = erdma_destroy_qp,
+	.get_dma_mr = erdma_get_dma_mr,
+	.get_port_immutable = erdma_get_port_immutable,
+	.iw_accept = erdma_accept,
+	.iw_add_ref = erdma_qp_get_ref,
+	.iw_connect = erdma_connect,
+	.iw_create_listen = erdma_create_listen,
+	.iw_destroy_listen = erdma_destroy_listen,
+	.iw_get_qp = erdma_get_ibqp,
+	.iw_reject = erdma_reject,
+	.iw_rem_ref = erdma_qp_put_ref,
+	.map_mr_sg = erdma_map_mr_sg,
+	.mmap = erdma_mmap,
+	.mmap_free = erdma_mmap_free,
+	.modify_qp = erdma_modify_qp,
+	.post_recv = erdma_post_recv,
+	.post_send = erdma_post_send,
+	.poll_cq = erdma_poll_cq,
+	.query_device = erdma_query_device,
+	.query_gid = erdma_query_gid,
+	.query_port = erdma_query_port,
+	.query_qp = erdma_query_qp,
+	.req_notify_cq = erdma_req_notify_cq,
+	.reg_user_mr = erdma_reg_user_mr,
+
+	INIT_RDMA_OBJ_SIZE(ib_cq, erdma_cq, ibcq),
+	INIT_RDMA_OBJ_SIZE(ib_pd, erdma_pd, ibpd),
+	INIT_RDMA_OBJ_SIZE(ib_ucontext, erdma_ucontext, ibucontext),
+	INIT_RDMA_OBJ_SIZE(ib_qp, erdma_qp, ibqp),
+};
+
+static int erdma_ib_device_add(struct pci_dev *pdev)
+{
+	struct erdma_dev *dev = pci_get_drvdata(pdev);
+	struct ib_device *ibdev = &dev->ibdev;
+	u64 mac;
+	int ret;
+
+	ret = erdma_dev_attrs_init(dev);
+	if (ret)
+		return ret;
+
+	ibdev->node_type = RDMA_NODE_RNIC;
+	memcpy(ibdev->node_desc, ERDMA_NODE_DESC, sizeof(ERDMA_NODE_DESC));
+
+	/*
+	 * Current model (one-to-one device association):
+	 * One ERDMA device per net_device or, equivalently,
+	 * per physical port.
+	 */
+	ibdev->phys_port_cnt = 1;
+	ibdev->num_comp_vectors = dev->attrs.irq_num - 1;
+
+	ib_set_device_ops(ibdev, &erdma_device_ops);
+
+	INIT_LIST_HEAD(&dev->cep_list);
+
+	spin_lock_init(&dev->lock);
+	xa_init_flags(&dev->qp_xa, XA_FLAGS_ALLOC1);
+	xa_init_flags(&dev->cq_xa, XA_FLAGS_ALLOC1);
+	dev->next_alloc_cqn = 1;
+	dev->next_alloc_qpn = 1;
+
+	ret = erdma_res_cb_init(dev);
+	if (ret)
+		return ret;
+
+	spin_lock_init(&dev->db_bitmap_lock);
+	bitmap_zero(dev->sdb_page, ERDMA_DWQE_TYPE0_CNT);
+	bitmap_zero(dev->sdb_entry, ERDMA_DWQE_TYPE1_CNT);
+
+	atomic_set(&dev->num_ctx, 0);
+
+	mac = erdma_reg_read32(dev, ERDMA_REGS_NETDEV_MAC_L_REG);
+	mac |= (u64)erdma_reg_read32(dev, ERDMA_REGS_NETDEV_MAC_H_REG) << 32;
+
+	u64_to_ether_addr(mac, dev->attrs.peer_addr);
+
+	ret = erdma_device_register(dev);
+	if (ret)
+		goto err_out;
+
+	return 0;
+
+err_out:
+	xa_destroy(&dev->qp_xa);
+	xa_destroy(&dev->cq_xa);
+
+	erdma_res_cb_free(dev);
+
+	return ret;
+}
+
+static void erdma_ib_device_remove(struct pci_dev *pdev)
+{
+	struct erdma_dev *dev = pci_get_drvdata(pdev);
+
+	unregister_netdevice_notifier(&dev->netdev_nb);
+	ib_unregister_device(&dev->ibdev);
+
+	erdma_res_cb_free(dev);
+	xa_destroy(&dev->qp_xa);
+	xa_destroy(&dev->cq_xa);
+}
+
+static int erdma_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
+{
+	int ret;
+
+	ret = erdma_probe_dev(pdev);
+	if (ret)
+		return ret;
+
+	ret = erdma_ib_device_add(pdev);
+	if (ret) {
+		erdma_remove_dev(pdev);
+		return ret;
+	}
+
+	return 0;
+}
+
+static void erdma_remove(struct pci_dev *pdev)
+{
+	erdma_ib_device_remove(pdev);
+	erdma_remove_dev(pdev);
+}
+
+static struct pci_driver erdma_pci_driver = {
+	.name = DRV_MODULE_NAME,
+	.id_table = erdma_pci_tbl,
+	.probe = erdma_probe,
+	.remove = erdma_remove
+};
+
+MODULE_DEVICE_TABLE(pci, erdma_pci_tbl);
+
+static __init int erdma_init_module(void)
+{
+	int ret;
+
+	ret = erdma_cm_init();
+	if (ret)
+		return ret;
+
+	ret = pci_register_driver(&erdma_pci_driver);
+	if (ret)
+		erdma_cm_exit();
+
+	return ret;
+}
+
+static void __exit erdma_exit_module(void)
+{
+	pci_unregister_driver(&erdma_pci_driver);
+
+	erdma_cm_exit();
+}
+
+module_init(erdma_init_module);
+module_exit(erdma_exit_module);
diff --git a/drivers/infiniband/hw/erdma/erdma_qp.c b/drivers/infiniband/hw/erdma/erdma_qp.c
new file mode 100644
index 000000000000..72f08171a28a
--- /dev/null
+++ b/drivers/infiniband/hw/erdma/erdma_qp.c
@@ -0,0 +1,566 @@
+// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
+
+/* Authors: Cheng Xu <chengyou@linux.alibaba.com> */
+/*          Kai Shen <kaishen@linux.alibaba.com> */
+/* Copyright (c) 2020-2021, Alibaba Group */
+/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
+/* Copyright (c) 2008-2019, IBM Corporation */
+
+#include <linux/errno.h>
+#include <linux/pci.h>
+#include <linux/scatterlist.h>
+#include <linux/types.h>
+
+#include <rdma/ib_user_verbs.h>
+#include <rdma/ib_verbs.h>
+
+#include "erdma.h"
+#include "erdma_cm.h"
+#include "erdma_verbs.h"
+
+void erdma_qp_llp_close(struct erdma_qp *qp)
+{
+	struct erdma_qp_attrs qp_attrs;
+
+	down_write(&qp->state_lock);
+
+	switch (qp->attrs.state) {
+	case ERDMA_QP_STATE_RTS:
+	case ERDMA_QP_STATE_RTR:
+	case ERDMA_QP_STATE_IDLE:
+	case ERDMA_QP_STATE_TERMINATE:
+		qp_attrs.state = ERDMA_QP_STATE_CLOSING;
+		erdma_modify_qp_internal(qp, &qp_attrs, ERDMA_QP_ATTR_STATE);
+		break;
+	case ERDMA_QP_STATE_CLOSING:
+		qp->attrs.state = ERDMA_QP_STATE_IDLE;
+		break;
+	default:
+		break;
+	}
+
+	if (qp->cep) {
+		erdma_cep_put(qp->cep);
+		qp->cep = NULL;
+	}
+
+	up_write(&qp->state_lock);
+}
+
+struct ib_qp *erdma_get_ibqp(struct ib_device *ibdev, int id)
+{
+	struct erdma_qp *qp = find_qp_by_qpn(to_edev(ibdev), id);
+
+	if (qp)
+		return &qp->ibqp;
+
+	return NULL;
+}
+
+static int erdma_modify_qp_state_to_rts(struct erdma_qp *qp,
+					struct erdma_qp_attrs *attrs,
+					enum erdma_qp_attr_mask mask)
+{
+	int ret;
+	struct erdma_dev *dev = qp->dev;
+	struct erdma_cmdq_modify_qp_req req;
+	struct tcp_sock *tp;
+	struct erdma_cep *cep = qp->cep;
+	struct sockaddr_storage local_addr, remote_addr;
+
+	if (!(mask & ERDMA_QP_ATTR_LLP_HANDLE))
+		return -EINVAL;
+
+	if (!(mask & ERDMA_QP_ATTR_MPA))
+		return -EINVAL;
+
+	ret = getname_local(cep->sock, &local_addr);
+	if (ret < 0)
+		return ret;
+
+	ret = getname_peer(cep->sock, &remote_addr);
+	if (ret < 0)
+		return ret;
+
+	qp->attrs.state = ERDMA_QP_STATE_RTS;
+
+	tp = tcp_sk(qp->cep->sock->sk);
+
+	erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA,
+				CMDQ_OPCODE_MODIFY_QP);
+
+	req.cfg = FIELD_PREP(ERDMA_CMD_MODIFY_QP_STATE_MASK, qp->attrs.state) |
+		  FIELD_PREP(ERDMA_CMD_MODIFY_QP_CC_MASK, qp->attrs.cc) |
+		  FIELD_PREP(ERDMA_CMD_MODIFY_QP_QPN_MASK, QP_ID(qp));
+
+	req.cookie = be32_to_cpu(qp->cep->mpa.ext_data.cookie);
+	req.dip = to_sockaddr_in(remote_addr).sin_addr.s_addr;
+	req.sip = to_sockaddr_in(local_addr).sin_addr.s_addr;
+	req.dport = to_sockaddr_in(remote_addr).sin_port;
+	req.sport = to_sockaddr_in(local_addr).sin_port;
+
+	req.send_nxt = tp->snd_nxt;
+	/* rsvd tcp seq for mpa-rsp in server. */
+	if (qp->attrs.qp_type == ERDMA_QP_PASSIVE)
+		req.send_nxt += MPA_DEFAULT_HDR_LEN + qp->attrs.pd_len;
+	req.recv_nxt = tp->rcv_nxt;
+
+	return erdma_post_cmd_wait(&dev->cmdq, (u64 *)&req, sizeof(req), NULL,
+				   NULL);
+}
+
+static int erdma_modify_qp_state_to_stop(struct erdma_qp *qp,
+					 struct erdma_qp_attrs *attrs,
+					 enum erdma_qp_attr_mask mask)
+{
+	struct erdma_dev *dev = qp->dev;
+	struct erdma_cmdq_modify_qp_req req;
+
+	qp->attrs.state = attrs->state;
+
+	erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA,
+				CMDQ_OPCODE_MODIFY_QP);
+
+	req.cfg = FIELD_PREP(ERDMA_CMD_MODIFY_QP_STATE_MASK, attrs->state) |
+		  FIELD_PREP(ERDMA_CMD_MODIFY_QP_QPN_MASK, QP_ID(qp));
+
+	return erdma_post_cmd_wait(&dev->cmdq, (u64 *)&req, sizeof(req), NULL,
+				   NULL);
+}
+
+int erdma_modify_qp_internal(struct erdma_qp *qp, struct erdma_qp_attrs *attrs,
+			     enum erdma_qp_attr_mask mask)
+{
+	int drop_conn, ret = 0;
+
+	if (!mask)
+		return 0;
+
+	if (!(mask & ERDMA_QP_ATTR_STATE))
+		return 0;
+
+	switch (qp->attrs.state) {
+	case ERDMA_QP_STATE_IDLE:
+	case ERDMA_QP_STATE_RTR:
+		if (attrs->state == ERDMA_QP_STATE_RTS) {
+			ret = erdma_modify_qp_state_to_rts(qp, attrs, mask);
+		} else if (attrs->state == ERDMA_QP_STATE_ERROR) {
+			qp->attrs.state = ERDMA_QP_STATE_ERROR;
+			if (qp->cep) {
+				erdma_cep_put(qp->cep);
+				qp->cep = NULL;
+			}
+			ret = erdma_modify_qp_state_to_stop(qp, attrs, mask);
+		}
+		break;
+	case ERDMA_QP_STATE_RTS:
+		drop_conn = 0;
+
+		if (attrs->state == ERDMA_QP_STATE_CLOSING) {
+			ret = erdma_modify_qp_state_to_stop(qp, attrs, mask);
+			drop_conn = 1;
+		} else if (attrs->state == ERDMA_QP_STATE_TERMINATE) {
+			qp->attrs.state = ERDMA_QP_STATE_TERMINATE;
+			ret = erdma_modify_qp_state_to_stop(qp, attrs, mask);
+			drop_conn = 1;
+		} else if (attrs->state == ERDMA_QP_STATE_ERROR) {
+			ret = erdma_modify_qp_state_to_stop(qp, attrs, mask);
+			qp->attrs.state = ERDMA_QP_STATE_ERROR;
+			drop_conn = 1;
+		}
+
+		if (drop_conn)
+			erdma_qp_cm_drop(qp);
+
+		break;
+	case ERDMA_QP_STATE_TERMINATE:
+		if (attrs->state == ERDMA_QP_STATE_ERROR)
+			qp->attrs.state = ERDMA_QP_STATE_ERROR;
+		break;
+	case ERDMA_QP_STATE_CLOSING:
+		if (attrs->state == ERDMA_QP_STATE_IDLE) {
+			qp->attrs.state = ERDMA_QP_STATE_IDLE;
+		} else if (attrs->state == ERDMA_QP_STATE_ERROR) {
+			ret = erdma_modify_qp_state_to_stop(qp, attrs, mask);
+			qp->attrs.state = ERDMA_QP_STATE_ERROR;
+		} else if (attrs->state != ERDMA_QP_STATE_CLOSING) {
+			return -ECONNABORTED;
+		}
+		break;
+	default:
+		break;
+	}
+
+	return ret;
+}
+
+static void erdma_qp_safe_free(struct kref *ref)
+{
+	struct erdma_qp *qp = container_of(ref, struct erdma_qp, ref);
+
+	complete(&qp->safe_free);
+}
+
+void erdma_qp_put(struct erdma_qp *qp)
+{
+	WARN_ON(kref_read(&qp->ref) < 1);
+	kref_put(&qp->ref, erdma_qp_safe_free);
+}
+
+void erdma_qp_get(struct erdma_qp *qp)
+{
+	kref_get(&qp->ref);
+}
+
+static int fill_inline_data(struct erdma_qp *qp,
+			    const struct ib_send_wr *send_wr, u16 wqe_idx,
+			    u32 sgl_offset, __le32 *length_field)
+{
+	u32 remain_size, copy_size, data_off, bytes = 0;
+	char *data;
+	int i = 0;
+
+	wqe_idx += (sgl_offset >> SQEBB_SHIFT);
+	sgl_offset &= (SQEBB_SIZE - 1);
+	data = get_queue_entry(qp->kern_qp.sq_buf, wqe_idx, qp->attrs.sq_size,
+			       SQEBB_SHIFT);
+
+	while (i < send_wr->num_sge) {
+		bytes += send_wr->sg_list[i].length;
+		if (bytes > (int)ERDMA_MAX_INLINE)
+			return -EINVAL;
+
+		remain_size = send_wr->sg_list[i].length;
+		data_off = 0;
+
+		while (1) {
+			copy_size = min(remain_size, SQEBB_SIZE - sgl_offset);
+
+			memcpy(data + sgl_offset,
+			       (void *)(uintptr_t)send_wr->sg_list[i].addr +
+				       data_off,
+			       copy_size);
+			remain_size -= copy_size;
+			data_off += copy_size;
+			sgl_offset += copy_size;
+			wqe_idx += (sgl_offset >> SQEBB_SHIFT);
+			sgl_offset &= (SQEBB_SIZE - 1);
+
+			data = get_queue_entry(qp->kern_qp.sq_buf, wqe_idx,
+					       qp->attrs.sq_size, SQEBB_SHIFT);
+			if (!remain_size)
+				break;
+		}
+
+		i++;
+	}
+	*length_field = cpu_to_le32(bytes);
+
+	return bytes;
+}
+
+static int fill_sgl(struct erdma_qp *qp, const struct ib_send_wr *send_wr,
+		    u16 wqe_idx, u32 sgl_offset, __le32 *length_field)
+{
+	int i = 0;
+	u32 bytes = 0;
+	char *sgl;
+
+	if (send_wr->num_sge > qp->dev->attrs.max_send_sge)
+		return -EINVAL;
+
+	if (sgl_offset & 0xF)
+		return -EINVAL;
+
+	while (i < send_wr->num_sge) {
+		wqe_idx += (sgl_offset >> SQEBB_SHIFT);
+		sgl_offset &= (SQEBB_SIZE - 1);
+		sgl = get_queue_entry(qp->kern_qp.sq_buf, wqe_idx,
+				      qp->attrs.sq_size, SQEBB_SHIFT);
+
+		bytes += send_wr->sg_list[i].length;
+		memcpy(sgl + sgl_offset, &send_wr->sg_list[i],
+		       sizeof(struct ib_sge));
+
+		sgl_offset += sizeof(struct ib_sge);
+		i++;
+	}
+
+	*length_field = cpu_to_le32(bytes);
+	return 0;
+}
+
+static int erdma_push_one_sqe(struct erdma_qp *qp, u16 *pi,
+			      const struct ib_send_wr *send_wr)
+{
+	u32 wqe_size, wqebb_cnt, hw_op, flags, sgl_offset;
+	u32 idx = *pi & (qp->attrs.sq_size - 1);
+	enum ib_wr_opcode op = send_wr->opcode;
+	struct erdma_readreq_sqe *read_sqe;
+	struct erdma_reg_mr_sqe *regmr_sge;
+	struct erdma_write_sqe *write_sqe;
+	struct erdma_send_sqe *send_sqe;
+	struct ib_rdma_wr *rdma_wr;
+	struct erdma_mr *mr;
+	__le32 *length_field;
+	u64 wqe_hdr, *entry;
+	struct ib_sge *sge;
+	u32 attrs;
+	int ret;
+
+	entry = get_queue_entry(qp->kern_qp.sq_buf, idx, qp->attrs.sq_size,
+				SQEBB_SHIFT);
+
+	/* Clear the SQE header section. */
+	*entry = 0;
+
+	qp->kern_qp.swr_tbl[idx] = send_wr->wr_id;
+	flags = send_wr->send_flags;
+	wqe_hdr = FIELD_PREP(
+		ERDMA_SQE_HDR_CE_MASK,
+		((flags & IB_SEND_SIGNALED) || qp->kern_qp.sig_all) ? 1 : 0);
+	wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_SE_MASK,
+			      flags & IB_SEND_SOLICITED ? 1 : 0);
+	wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_FENCE_MASK,
+			      flags & IB_SEND_FENCE ? 1 : 0);
+	wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_INLINE_MASK,
+			      flags & IB_SEND_INLINE ? 1 : 0);
+	wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_QPN_MASK, QP_ID(qp));
+
+	switch (op) {
+	case IB_WR_RDMA_WRITE:
+	case IB_WR_RDMA_WRITE_WITH_IMM:
+		hw_op = ERDMA_OP_WRITE;
+		if (op == IB_WR_RDMA_WRITE_WITH_IMM)
+			hw_op = ERDMA_OP_WRITE_WITH_IMM;
+		wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_OPCODE_MASK, hw_op);
+		rdma_wr = container_of(send_wr, struct ib_rdma_wr, wr);
+		write_sqe = (struct erdma_write_sqe *)entry;
+
+		write_sqe->imm_data = send_wr->ex.imm_data;
+		write_sqe->sink_stag = cpu_to_le32(rdma_wr->rkey);
+		write_sqe->sink_to_h =
+			cpu_to_le32(upper_32_bits(rdma_wr->remote_addr));
+		write_sqe->sink_to_l =
+			cpu_to_le32(lower_32_bits(rdma_wr->remote_addr));
+
+		length_field = &write_sqe->length;
+		wqe_size = sizeof(struct erdma_write_sqe);
+		sgl_offset = wqe_size;
+		break;
+	case IB_WR_RDMA_READ:
+	case IB_WR_RDMA_READ_WITH_INV:
+		read_sqe = (struct erdma_readreq_sqe *)entry;
+		if (unlikely(send_wr->num_sge != 1))
+			return -EINVAL;
+		hw_op = ERDMA_OP_READ;
+		if (op == IB_WR_RDMA_READ_WITH_INV) {
+			hw_op = ERDMA_OP_READ_WITH_INV;
+			read_sqe->invalid_stag =
+				cpu_to_le32(send_wr->ex.invalidate_rkey);
+		}
+
+		wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_OPCODE_MASK, hw_op);
+		rdma_wr = container_of(send_wr, struct ib_rdma_wr, wr);
+		read_sqe->length = cpu_to_le32(send_wr->sg_list[0].length);
+		read_sqe->sink_stag = cpu_to_le32(send_wr->sg_list[0].lkey);
+		read_sqe->sink_to_l =
+			cpu_to_le32(lower_32_bits(send_wr->sg_list[0].addr));
+		read_sqe->sink_to_h =
+			cpu_to_le32(upper_32_bits(send_wr->sg_list[0].addr));
+
+		sge = get_queue_entry(qp->kern_qp.sq_buf, idx + 1,
+				      qp->attrs.sq_size, SQEBB_SHIFT);
+		sge->addr = rdma_wr->remote_addr;
+		sge->lkey = rdma_wr->rkey;
+		sge->length = send_wr->sg_list[0].length;
+		wqe_size = sizeof(struct erdma_readreq_sqe) +
+			   send_wr->num_sge * sizeof(struct ib_sge);
+
+		goto out;
+	case IB_WR_SEND:
+	case IB_WR_SEND_WITH_IMM:
+	case IB_WR_SEND_WITH_INV:
+		send_sqe = (struct erdma_send_sqe *)entry;
+		hw_op = ERDMA_OP_SEND;
+		if (op == IB_WR_SEND_WITH_IMM) {
+			hw_op = ERDMA_OP_SEND_WITH_IMM;
+			send_sqe->imm_data = send_wr->ex.imm_data;
+		} else if (op == IB_WR_SEND_WITH_INV) {
+			hw_op = ERDMA_OP_SEND_WITH_INV;
+			send_sqe->invalid_stag =
+				cpu_to_le32(send_wr->ex.invalidate_rkey);
+		}
+		wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_OPCODE_MASK, hw_op);
+		length_field = &send_sqe->length;
+		wqe_size = sizeof(struct erdma_send_sqe);
+		sgl_offset = wqe_size;
+
+		break;
+	case IB_WR_REG_MR:
+		wqe_hdr |=
+			FIELD_PREP(ERDMA_SQE_HDR_OPCODE_MASK, ERDMA_OP_REG_MR);
+		regmr_sge = (struct erdma_reg_mr_sqe *)entry;
+		mr = to_emr(reg_wr(send_wr)->mr);
+
+		mr->access = ERDMA_MR_ACC_LR |
+			     to_erdma_access_flags(reg_wr(send_wr)->access);
+		regmr_sge->addr = cpu_to_le64(mr->ibmr.iova);
+		regmr_sge->length = cpu_to_le32(mr->ibmr.length);
+		regmr_sge->stag = cpu_to_le32(mr->ibmr.lkey);
+		attrs = FIELD_PREP(ERDMA_SQE_MR_MODE_MASK, 0) |
+			FIELD_PREP(ERDMA_SQE_MR_ACCESS_MASK, mr->access) |
+			FIELD_PREP(ERDMA_SQE_MR_MTT_CNT_MASK,
+				   mr->mem.mtt_nents);
+
+		if (mr->mem.mtt_nents < ERDMA_MAX_INLINE_MTT_ENTRIES) {
+			attrs |= FIELD_PREP(ERDMA_SQE_MR_MTT_TYPE_MASK, 0);
+			/* Copy SGLs to SQE content to accelerate */
+			memcpy(get_queue_entry(qp->kern_qp.sq_buf, idx + 1,
+					       qp->attrs.sq_size, SQEBB_SHIFT),
+			       mr->mem.mtt_buf, MTT_SIZE(mr->mem.mtt_nents));
+			wqe_size = sizeof(struct erdma_reg_mr_sqe) +
+				   MTT_SIZE(mr->mem.mtt_nents);
+		} else {
+			attrs |= FIELD_PREP(ERDMA_SQE_MR_MTT_TYPE_MASK, 1);
+			wqe_size = sizeof(struct erdma_reg_mr_sqe);
+		}
+
+		regmr_sge->attrs = cpu_to_le32(attrs);
+		goto out;
+	case IB_WR_LOCAL_INV:
+		wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_OPCODE_MASK,
+				      ERDMA_OP_LOCAL_INV);
+		regmr_sge = (struct erdma_reg_mr_sqe *)entry;
+		regmr_sge->stag = cpu_to_le32(send_wr->ex.invalidate_rkey);
+		wqe_size = sizeof(struct erdma_reg_mr_sqe);
+		goto out;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	if (flags & IB_SEND_INLINE) {
+		ret = fill_inline_data(qp, send_wr, idx, sgl_offset,
+				       length_field);
+		if (ret < 0)
+			return -EINVAL;
+		wqe_size += ret;
+		wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_SGL_LEN_MASK, ret);
+	} else {
+		ret = fill_sgl(qp, send_wr, idx, sgl_offset, length_field);
+		if (ret)
+			return -EINVAL;
+		wqe_size += send_wr->num_sge * sizeof(struct ib_sge);
+		wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_SGL_LEN_MASK,
+				      send_wr->num_sge);
+	}
+
+out:
+	wqebb_cnt = SQEBB_COUNT(wqe_size);
+	wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_WQEBB_CNT_MASK, wqebb_cnt - 1);
+	*pi += wqebb_cnt;
+	wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_WQEBB_INDEX_MASK, *pi);
+
+	*entry = wqe_hdr;
+
+	return 0;
+}
+
+static void kick_sq_db(struct erdma_qp *qp, u16 pi)
+{
+	u64 db_data = FIELD_PREP(ERDMA_SQE_HDR_QPN_MASK, QP_ID(qp)) |
+		      FIELD_PREP(ERDMA_SQE_HDR_WQEBB_INDEX_MASK, pi);
+
+	*(u64 *)qp->kern_qp.sq_db_info = db_data;
+	writeq(db_data, qp->kern_qp.hw_sq_db);
+}
+
+int erdma_post_send(struct ib_qp *ibqp, const struct ib_send_wr *send_wr,
+		    const struct ib_send_wr **bad_send_wr)
+{
+	struct erdma_qp *qp = to_eqp(ibqp);
+	int ret = 0;
+	const struct ib_send_wr *wr = send_wr;
+	unsigned long flags;
+	u16 sq_pi;
+
+	if (!send_wr)
+		return -EINVAL;
+
+	spin_lock_irqsave(&qp->lock, flags);
+	sq_pi = qp->kern_qp.sq_pi;
+
+	while (wr) {
+		if ((u16)(sq_pi - qp->kern_qp.sq_ci) >= qp->attrs.sq_size) {
+			ret = -ENOMEM;
+			*bad_send_wr = send_wr;
+			break;
+		}
+
+		ret = erdma_push_one_sqe(qp, &sq_pi, wr);
+		if (ret) {
+			*bad_send_wr = wr;
+			break;
+		}
+		qp->kern_qp.sq_pi = sq_pi;
+		kick_sq_db(qp, sq_pi);
+
+		wr = wr->next;
+	}
+	spin_unlock_irqrestore(&qp->lock, flags);
+
+	return ret;
+}
+
+static int erdma_post_recv_one(struct erdma_qp *qp,
+			       const struct ib_recv_wr *recv_wr)
+{
+	struct erdma_rqe *rqe =
+		get_queue_entry(qp->kern_qp.rq_buf, qp->kern_qp.rq_pi,
+				qp->attrs.rq_size, RQE_SHIFT);
+
+	rqe->qe_idx = cpu_to_le16(qp->kern_qp.rq_pi + 1);
+	rqe->qpn = cpu_to_le32(QP_ID(qp));
+
+	if (recv_wr->num_sge == 0) {
+		rqe->length = 0;
+	} else if (recv_wr->num_sge == 1) {
+		rqe->stag = cpu_to_le32(recv_wr->sg_list[0].lkey);
+		rqe->to = cpu_to_le64(recv_wr->sg_list[0].addr);
+		rqe->length = cpu_to_le32(recv_wr->sg_list[0].length);
+	} else {
+		return -EINVAL;
+	}
+
+	*(u64 *)qp->kern_qp.rq_db_info = *(u64 *)rqe;
+	writeq(*(u64 *)rqe, qp->kern_qp.hw_rq_db);
+
+	qp->kern_qp.rwr_tbl[qp->kern_qp.rq_pi & (qp->attrs.rq_size - 1)] =
+		recv_wr->wr_id;
+	qp->kern_qp.rq_pi++;
+
+	return 0;
+}
+
+int erdma_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *recv_wr,
+		    const struct ib_recv_wr **bad_recv_wr)
+{
+	const struct ib_recv_wr *wr = recv_wr;
+	struct erdma_qp *qp = to_eqp(ibqp);
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&qp->lock, flags);
+
+	while (wr) {
+		ret = erdma_post_recv_one(qp, wr);
+		if (ret) {
+			*bad_recv_wr = wr;
+			break;
+		}
+		wr = wr->next;
+	}
+
+	spin_unlock_irqrestore(&qp->lock, flags);
+	return ret;
+}
diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.c b/drivers/infiniband/hw/erdma/erdma_verbs.c
new file mode 100644
index 000000000000..a7a3d42e2016
--- /dev/null
+++ b/drivers/infiniband/hw/erdma/erdma_verbs.c
@@ -0,0 +1,1460 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/* Authors: Cheng Xu <chengyou@linux.alibaba.com> */
+/*          Kai Shen <kaishen@linux.alibaba.com> */
+/* Copyright (c) 2020-2022, Alibaba Group. */
+
+/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
+/* Copyright (c) 2008-2019, IBM Corporation */
+
+/* Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. */
+
+#include <linux/errno.h>
+#include <linux/pci.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+#include <linux/vmalloc.h>
+#include <net/addrconf.h>
+#include <rdma/erdma-abi.h>
+#include <rdma/ib_umem.h>
+#include <rdma/ib_user_verbs.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/uverbs_ioctl.h>
+
+#include "erdma.h"
+#include "erdma_cm.h"
+#include "erdma_hw.h"
+#include "erdma_verbs.h"
+
+static int create_qp_cmd(struct erdma_dev *dev, struct erdma_qp *qp)
+{
+	struct erdma_cmdq_create_qp_req req;
+	struct erdma_pd *pd = to_epd(qp->ibqp.pd);
+	struct erdma_uqp *user_qp;
+	u64 resp0, resp1;
+	int err;
+
+	erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA,
+				CMDQ_OPCODE_CREATE_QP);
+
+	req.cfg0 = FIELD_PREP(ERDMA_CMD_CREATE_QP_SQ_DEPTH_MASK,
+			      ilog2(qp->attrs.sq_size)) |
+		   FIELD_PREP(ERDMA_CMD_CREATE_QP_QPN_MASK, QP_ID(qp));
+	req.cfg1 = FIELD_PREP(ERDMA_CMD_CREATE_QP_RQ_DEPTH_MASK,
+			      ilog2(qp->attrs.rq_size)) |
+		   FIELD_PREP(ERDMA_CMD_CREATE_QP_PD_MASK, pd->pdn);
+
+	if (rdma_is_kernel_res(&qp->ibqp.res)) {
+		u32 pgsz_range = ilog2(SZ_1M) - PAGE_SHIFT;
+
+		req.sq_cqn_mtt_cfg =
+			FIELD_PREP(ERDMA_CMD_CREATE_QP_PAGE_SIZE_MASK,
+				   pgsz_range) |
+			FIELD_PREP(ERDMA_CMD_CREATE_QP_CQN_MASK, qp->scq->cqn);
+		req.rq_cqn_mtt_cfg =
+			FIELD_PREP(ERDMA_CMD_CREATE_QP_PAGE_SIZE_MASK,
+				   pgsz_range) |
+			FIELD_PREP(ERDMA_CMD_CREATE_QP_CQN_MASK, qp->rcq->cqn);
+
+		req.sq_mtt_cfg =
+			FIELD_PREP(ERDMA_CMD_CREATE_QP_PAGE_OFFSET_MASK, 0) |
+			FIELD_PREP(ERDMA_CMD_CREATE_QP_MTT_CNT_MASK, 1) |
+			FIELD_PREP(ERDMA_CMD_CREATE_QP_MTT_TYPE_MASK,
+				   ERDMA_MR_INLINE_MTT);
+		req.rq_mtt_cfg = req.sq_mtt_cfg;
+
+		req.rq_buf_addr = qp->kern_qp.rq_buf_dma_addr;
+		req.sq_buf_addr = qp->kern_qp.sq_buf_dma_addr;
+		req.sq_db_info_dma_addr = qp->kern_qp.sq_buf_dma_addr +
+					  (qp->attrs.sq_size << SQEBB_SHIFT);
+		req.rq_db_info_dma_addr = qp->kern_qp.rq_buf_dma_addr +
+					  (qp->attrs.rq_size << RQE_SHIFT);
+	} else {
+		user_qp = &qp->user_qp;
+		req.sq_cqn_mtt_cfg = FIELD_PREP(
+			ERDMA_CMD_CREATE_QP_PAGE_SIZE_MASK,
+			ilog2(user_qp->sq_mtt.page_size) - PAGE_SHIFT);
+		req.sq_cqn_mtt_cfg |=
+			FIELD_PREP(ERDMA_CMD_CREATE_QP_CQN_MASK, qp->scq->cqn);
+
+		req.rq_cqn_mtt_cfg = FIELD_PREP(
+			ERDMA_CMD_CREATE_QP_PAGE_SIZE_MASK,
+			ilog2(user_qp->rq_mtt.page_size) - PAGE_SHIFT);
+		req.rq_cqn_mtt_cfg |=
+			FIELD_PREP(ERDMA_CMD_CREATE_QP_CQN_MASK, qp->rcq->cqn);
+
+		req.sq_mtt_cfg = user_qp->sq_mtt.page_offset;
+		req.sq_mtt_cfg |= FIELD_PREP(ERDMA_CMD_CREATE_QP_MTT_CNT_MASK,
+					     user_qp->sq_mtt.mtt_nents) |
+				  FIELD_PREP(ERDMA_CMD_CREATE_QP_MTT_TYPE_MASK,
+					     user_qp->sq_mtt.mtt_type);
+
+		req.rq_mtt_cfg = user_qp->rq_mtt.page_offset;
+		req.rq_mtt_cfg |= FIELD_PREP(ERDMA_CMD_CREATE_QP_MTT_CNT_MASK,
+					     user_qp->rq_mtt.mtt_nents) |
+				  FIELD_PREP(ERDMA_CMD_CREATE_QP_MTT_TYPE_MASK,
+					     user_qp->rq_mtt.mtt_type);
+
+		req.sq_buf_addr = user_qp->sq_mtt.mtt_entry[0];
+		req.rq_buf_addr = user_qp->rq_mtt.mtt_entry[0];
+
+		req.sq_db_info_dma_addr = user_qp->sq_db_info_dma_addr;
+		req.rq_db_info_dma_addr = user_qp->rq_db_info_dma_addr;
+	}
+
+	err = erdma_post_cmd_wait(&dev->cmdq, (u64 *)&req, sizeof(req), &resp0,
+				  &resp1);
+	if (!err)
+		qp->attrs.cookie =
+			FIELD_GET(ERDMA_CMDQ_CREATE_QP_RESP_COOKIE_MASK, resp0);
+
+	return err;
+}
+
+static int regmr_cmd(struct erdma_dev *dev, struct erdma_mr *mr)
+{
+	struct erdma_cmdq_reg_mr_req req;
+	struct erdma_pd *pd = to_epd(mr->ibmr.pd);
+	u64 *phy_addr;
+	int i;
+
+	erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA, CMDQ_OPCODE_REG_MR);
+
+	req.cfg0 = FIELD_PREP(ERDMA_CMD_MR_VALID_MASK, mr->valid) |
+		   FIELD_PREP(ERDMA_CMD_MR_KEY_MASK, mr->ibmr.lkey & 0xFF) |
+		   FIELD_PREP(ERDMA_CMD_MR_MPT_IDX_MASK, mr->ibmr.lkey >> 8);
+	req.cfg1 = FIELD_PREP(ERDMA_CMD_REGMR_PD_MASK, pd->pdn) |
+		   FIELD_PREP(ERDMA_CMD_REGMR_TYPE_MASK, mr->type) |
+		   FIELD_PREP(ERDMA_CMD_REGMR_RIGHT_MASK, mr->access) |
+		   FIELD_PREP(ERDMA_CMD_REGMR_ACC_MODE_MASK, 0);
+	req.cfg2 = FIELD_PREP(ERDMA_CMD_REGMR_PAGESIZE_MASK,
+			      ilog2(mr->mem.page_size)) |
+		   FIELD_PREP(ERDMA_CMD_REGMR_MTT_TYPE_MASK, mr->mem.mtt_type) |
+		   FIELD_PREP(ERDMA_CMD_REGMR_MTT_CNT_MASK, mr->mem.page_cnt);
+
+	if (mr->type == ERDMA_MR_TYPE_DMA)
+		goto post_cmd;
+
+	if (mr->type == ERDMA_MR_TYPE_NORMAL) {
+		req.start_va = mr->mem.va;
+		req.size = mr->mem.len;
+	}
+
+	if (mr->type == ERDMA_MR_TYPE_FRMR ||
+	    mr->mem.mtt_type == ERDMA_MR_INDIRECT_MTT) {
+		phy_addr = req.phy_addr;
+		*phy_addr = mr->mem.mtt_entry[0];
+	} else {
+		phy_addr = req.phy_addr;
+		for (i = 0; i < mr->mem.mtt_nents; i++)
+			*phy_addr++ = mr->mem.mtt_entry[i];
+	}
+
+post_cmd:
+	return erdma_post_cmd_wait(&dev->cmdq, (u64 *)&req, sizeof(req), NULL,
+				   NULL);
+}
+
+static int create_cq_cmd(struct erdma_dev *dev, struct erdma_cq *cq)
+{
+	struct erdma_cmdq_create_cq_req req;
+	u32 page_size;
+	struct erdma_mem *mtt;
+
+	erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA,
+				CMDQ_OPCODE_CREATE_CQ);
+
+	req.cfg0 = FIELD_PREP(ERDMA_CMD_CREATE_CQ_CQN_MASK, cq->cqn) |
+		   FIELD_PREP(ERDMA_CMD_CREATE_CQ_DEPTH_MASK, ilog2(cq->depth));
+	req.cfg1 = FIELD_PREP(ERDMA_CMD_CREATE_CQ_EQN_MASK, cq->assoc_eqn);
+
+	if (rdma_is_kernel_res(&cq->ibcq.res)) {
+		page_size = SZ_32M;
+		req.cfg0 |= FIELD_PREP(ERDMA_CMD_CREATE_CQ_PAGESIZE_MASK,
+				       ilog2(page_size) - PAGE_SHIFT);
+		req.qbuf_addr_l = lower_32_bits(cq->kern_cq.qbuf_dma_addr);
+		req.qbuf_addr_h = upper_32_bits(cq->kern_cq.qbuf_dma_addr);
+
+		req.cfg1 |= FIELD_PREP(ERDMA_CMD_CREATE_CQ_MTT_CNT_MASK, 1) |
+			    FIELD_PREP(ERDMA_CMD_CREATE_CQ_MTT_TYPE_MASK,
+				       ERDMA_MR_INLINE_MTT);
+
+		req.first_page_offset = 0;
+		req.cq_db_info_addr =
+			cq->kern_cq.qbuf_dma_addr + (cq->depth << CQE_SHIFT);
+	} else {
+		mtt = &cq->user_cq.qbuf_mtt;
+		req.cfg0 |= FIELD_PREP(ERDMA_CMD_CREATE_CQ_PAGESIZE_MASK,
+				       ilog2(mtt->page_size) - PAGE_SHIFT);
+		if (mtt->mtt_nents == 1) {
+			req.qbuf_addr_l = lower_32_bits(*(u64 *)mtt->mtt_buf);
+			req.qbuf_addr_h = upper_32_bits(*(u64 *)mtt->mtt_buf);
+		} else {
+			req.qbuf_addr_l = lower_32_bits(mtt->mtt_entry[0]);
+			req.qbuf_addr_h = upper_32_bits(mtt->mtt_entry[0]);
+		}
+		req.cfg1 |= FIELD_PREP(ERDMA_CMD_CREATE_CQ_MTT_CNT_MASK,
+				       mtt->mtt_nents);
+		req.cfg1 |= FIELD_PREP(ERDMA_CMD_CREATE_CQ_MTT_TYPE_MASK,
+				       mtt->mtt_type);
+
+		req.first_page_offset = mtt->page_offset;
+		req.cq_db_info_addr = cq->user_cq.db_info_dma_addr;
+	}
+
+	return erdma_post_cmd_wait(&dev->cmdq, (u64 *)&req, sizeof(req), NULL,
+				   NULL);
+}
+
+static int erdma_alloc_idx(struct erdma_resource_cb *res_cb)
+{
+	int idx;
+	unsigned long flags;
+
+	spin_lock_irqsave(&res_cb->lock, flags);
+	idx = find_next_zero_bit(res_cb->bitmap, res_cb->max_cap,
+				 res_cb->next_alloc_idx);
+	if (idx == res_cb->max_cap) {
+		idx = find_first_zero_bit(res_cb->bitmap, res_cb->max_cap);
+		if (idx == res_cb->max_cap) {
+			res_cb->next_alloc_idx = 1;
+			spin_unlock_irqrestore(&res_cb->lock, flags);
+			return -ENOSPC;
+		}
+	}
+
+	set_bit(idx, res_cb->bitmap);
+	res_cb->next_alloc_idx = idx + 1;
+	spin_unlock_irqrestore(&res_cb->lock, flags);
+
+	return idx;
+}
+
+static inline void erdma_free_idx(struct erdma_resource_cb *res_cb, u32 idx)
+{
+	unsigned long flags;
+	u32 used;
+
+	spin_lock_irqsave(&res_cb->lock, flags);
+	used = __test_and_clear_bit(idx, res_cb->bitmap);
+	spin_unlock_irqrestore(&res_cb->lock, flags);
+	WARN_ON(!used);
+}
+
+static struct rdma_user_mmap_entry *
+erdma_user_mmap_entry_insert(struct erdma_ucontext *uctx, void *address,
+			     u32 size, u8 mmap_flag, u64 *mmap_offset)
+{
+	struct erdma_user_mmap_entry *entry =
+		kzalloc(sizeof(*entry), GFP_KERNEL);
+	int ret;
+
+	if (!entry)
+		return NULL;
+
+	entry->address = (u64)address;
+	entry->mmap_flag = mmap_flag;
+
+	size = PAGE_ALIGN(size);
+
+	ret = rdma_user_mmap_entry_insert(&uctx->ibucontext, &entry->rdma_entry,
+					  size);
+	if (ret) {
+		kfree(entry);
+		return NULL;
+	}
+
+	*mmap_offset = rdma_user_mmap_get_offset(&entry->rdma_entry);
+
+	return &entry->rdma_entry;
+}
+
+int erdma_query_device(struct ib_device *ibdev, struct ib_device_attr *attr,
+		       struct ib_udata *unused)
+{
+	struct erdma_dev *dev = to_edev(ibdev);
+
+	memset(attr, 0, sizeof(*attr));
+
+	attr->max_mr_size = dev->attrs.max_mr_size;
+	attr->vendor_id = PCI_VENDOR_ID_ALIBABA;
+	attr->vendor_part_id = dev->pdev->device;
+	attr->hw_ver = dev->pdev->revision;
+	attr->max_qp = dev->attrs.max_qp;
+	attr->max_qp_wr = min(dev->attrs.max_send_wr, dev->attrs.max_recv_wr);
+	attr->max_qp_rd_atom = dev->attrs.max_ord;
+	attr->max_qp_init_rd_atom = dev->attrs.max_ird;
+	attr->max_res_rd_atom = dev->attrs.max_qp * dev->attrs.max_ird;
+	attr->device_cap_flags = IB_DEVICE_MEM_MGT_EXTENSIONS;
+	attr->kernel_cap_flags = IBK_LOCAL_DMA_LKEY;
+	ibdev->local_dma_lkey = dev->attrs.local_dma_key;
+	attr->max_send_sge = dev->attrs.max_send_sge;
+	attr->max_recv_sge = dev->attrs.max_recv_sge;
+	attr->max_sge_rd = dev->attrs.max_sge_rd;
+	attr->max_cq = dev->attrs.max_cq;
+	attr->max_cqe = dev->attrs.max_cqe;
+	attr->max_mr = dev->attrs.max_mr;
+	attr->max_pd = dev->attrs.max_pd;
+	attr->max_mw = dev->attrs.max_mw;
+	attr->max_fast_reg_page_list_len = ERDMA_MAX_FRMR_PA;
+	attr->page_size_cap = ERDMA_PAGE_SIZE_SUPPORT;
+	attr->fw_ver = dev->attrs.fw_version;
+
+	if (dev->netdev)
+		addrconf_addr_eui48((u8 *)&attr->sys_image_guid,
+				    dev->netdev->dev_addr);
+
+	return 0;
+}
+
+int erdma_query_gid(struct ib_device *ibdev, u32 port, int idx,
+		    union ib_gid *gid)
+{
+	struct erdma_dev *dev = to_edev(ibdev);
+
+	memset(gid, 0, sizeof(*gid));
+	ether_addr_copy(gid->raw, dev->attrs.peer_addr);
+
+	return 0;
+}
+
+int erdma_query_port(struct ib_device *ibdev, u32 port,
+		     struct ib_port_attr *attr)
+{
+	struct erdma_dev *dev = to_edev(ibdev);
+	struct net_device *ndev = dev->netdev;
+
+	memset(attr, 0, sizeof(*attr));
+
+	attr->gid_tbl_len = 1;
+	attr->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_DEVICE_MGMT_SUP;
+	attr->max_msg_sz = -1;
+
+	if (!ndev)
+		goto out;
+
+	ib_get_eth_speed(ibdev, port, &attr->active_speed, &attr->active_width);
+	attr->max_mtu = ib_mtu_int_to_enum(ndev->mtu);
+	attr->active_mtu = ib_mtu_int_to_enum(ndev->mtu);
+	if (netif_running(ndev) && netif_carrier_ok(ndev))
+		dev->state = IB_PORT_ACTIVE;
+	else
+		dev->state = IB_PORT_DOWN;
+	attr->state = dev->state;
+
+out:
+	if (dev->state == IB_PORT_ACTIVE)
+		attr->phys_state = IB_PORT_PHYS_STATE_LINK_UP;
+	else
+		attr->phys_state = IB_PORT_PHYS_STATE_DISABLED;
+
+	return 0;
+}
+
+int erdma_get_port_immutable(struct ib_device *ibdev, u32 port,
+			     struct ib_port_immutable *port_immutable)
+{
+	port_immutable->gid_tbl_len = 1;
+	port_immutable->core_cap_flags = RDMA_CORE_PORT_IWARP;
+
+	return 0;
+}
+
+int erdma_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
+{
+	struct erdma_pd *pd = to_epd(ibpd);
+	struct erdma_dev *dev = to_edev(ibpd->device);
+	int pdn;
+
+	pdn = erdma_alloc_idx(&dev->res_cb[ERDMA_RES_TYPE_PD]);
+	if (pdn < 0)
+		return pdn;
+
+	pd->pdn = pdn;
+
+	return 0;
+}
+
+int erdma_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
+{
+	struct erdma_pd *pd = to_epd(ibpd);
+	struct erdma_dev *dev = to_edev(ibpd->device);
+
+	erdma_free_idx(&dev->res_cb[ERDMA_RES_TYPE_PD], pd->pdn);
+
+	return 0;
+}
+
+static int erdma_qp_validate_cap(struct erdma_dev *dev,
+				 struct ib_qp_init_attr *attrs)
+{
+	if ((attrs->cap.max_send_wr > dev->attrs.max_send_wr) ||
+	    (attrs->cap.max_recv_wr > dev->attrs.max_recv_wr) ||
+	    (attrs->cap.max_send_sge > dev->attrs.max_send_sge) ||
+	    (attrs->cap.max_recv_sge > dev->attrs.max_recv_sge) ||
+	    (attrs->cap.max_inline_data > ERDMA_MAX_INLINE) ||
+	    !attrs->cap.max_send_wr || !attrs->cap.max_recv_wr) {
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int erdma_qp_validate_attr(struct erdma_dev *dev,
+				  struct ib_qp_init_attr *attrs)
+{
+	if (attrs->qp_type != IB_QPT_RC)
+		return -EOPNOTSUPP;
+
+	if (attrs->srq)
+		return -EOPNOTSUPP;
+
+	if (!attrs->send_cq || !attrs->recv_cq)
+		return -EOPNOTSUPP;
+
+	return 0;
+}
+
+static void free_kernel_qp(struct erdma_qp *qp)
+{
+	struct erdma_dev *dev = qp->dev;
+
+	vfree(qp->kern_qp.swr_tbl);
+	vfree(qp->kern_qp.rwr_tbl);
+
+	if (qp->kern_qp.sq_buf)
+		dma_free_coherent(
+			&dev->pdev->dev,
+			WARPPED_BUFSIZE(qp->attrs.sq_size << SQEBB_SHIFT),
+			qp->kern_qp.sq_buf, qp->kern_qp.sq_buf_dma_addr);
+
+	if (qp->kern_qp.rq_buf)
+		dma_free_coherent(
+			&dev->pdev->dev,
+			WARPPED_BUFSIZE(qp->attrs.rq_size << RQE_SHIFT),
+			qp->kern_qp.rq_buf, qp->kern_qp.rq_buf_dma_addr);
+}
+
+static int init_kernel_qp(struct erdma_dev *dev, struct erdma_qp *qp,
+			  struct ib_qp_init_attr *attrs)
+{
+	struct erdma_kqp *kqp = &qp->kern_qp;
+	int size;
+
+	if (attrs->sq_sig_type == IB_SIGNAL_ALL_WR)
+		kqp->sig_all = 1;
+
+	kqp->sq_pi = 0;
+	kqp->sq_ci = 0;
+	kqp->rq_pi = 0;
+	kqp->rq_ci = 0;
+	kqp->hw_sq_db =
+		dev->func_bar + (ERDMA_SDB_SHARED_PAGE_INDEX << PAGE_SHIFT);
+	kqp->hw_rq_db = dev->func_bar + ERDMA_BAR_RQDB_SPACE_OFFSET;
+
+	kqp->swr_tbl = vmalloc(qp->attrs.sq_size * sizeof(u64));
+	kqp->rwr_tbl = vmalloc(qp->attrs.rq_size * sizeof(u64));
+	if (!kqp->swr_tbl || !kqp->rwr_tbl)
+		goto err_out;
+
+	size = (qp->attrs.sq_size << SQEBB_SHIFT) + ERDMA_EXTRA_BUFFER_SIZE;
+	kqp->sq_buf = dma_alloc_coherent(&dev->pdev->dev, size,
+					 &kqp->sq_buf_dma_addr, GFP_KERNEL);
+	if (!kqp->sq_buf)
+		goto err_out;
+
+	size = (qp->attrs.rq_size << RQE_SHIFT) + ERDMA_EXTRA_BUFFER_SIZE;
+	kqp->rq_buf = dma_alloc_coherent(&dev->pdev->dev, size,
+					 &kqp->rq_buf_dma_addr, GFP_KERNEL);
+	if (!kqp->rq_buf)
+		goto err_out;
+
+	kqp->sq_db_info = kqp->sq_buf + (qp->attrs.sq_size << SQEBB_SHIFT);
+	kqp->rq_db_info = kqp->rq_buf + (qp->attrs.rq_size << RQE_SHIFT);
+
+	return 0;
+
+err_out:
+	free_kernel_qp(qp);
+	return -ENOMEM;
+}
+
+static int get_mtt_entries(struct erdma_dev *dev, struct erdma_mem *mem,
+			   u64 start, u64 len, int access, u64 virt,
+			   unsigned long req_page_size, u8 force_indirect_mtt)
+{
+	struct ib_block_iter biter;
+	uint64_t *phy_addr = NULL;
+	int ret = 0;
+
+	mem->umem = ib_umem_get(&dev->ibdev, start, len, access);
+	if (IS_ERR(mem->umem)) {
+		ret = PTR_ERR(mem->umem);
+		mem->umem = NULL;
+		return ret;
+	}
+
+	mem->va = virt;
+	mem->len = len;
+	mem->page_size = ib_umem_find_best_pgsz(mem->umem, req_page_size, virt);
+	mem->page_offset = start & (mem->page_size - 1);
+	mem->mtt_nents = ib_umem_num_dma_blocks(mem->umem, mem->page_size);
+	mem->page_cnt = mem->mtt_nents;
+
+	if (mem->page_cnt > ERDMA_MAX_INLINE_MTT_ENTRIES ||
+	    force_indirect_mtt) {
+		mem->mtt_type = ERDMA_MR_INDIRECT_MTT;
+		mem->mtt_buf =
+			alloc_pages_exact(MTT_SIZE(mem->page_cnt), GFP_KERNEL);
+		if (!mem->mtt_buf) {
+			ret = -ENOMEM;
+			goto error_ret;
+		}
+		phy_addr = mem->mtt_buf;
+	} else {
+		mem->mtt_type = ERDMA_MR_INLINE_MTT;
+		phy_addr = mem->mtt_entry;
+	}
+
+	rdma_umem_for_each_dma_block(mem->umem, &biter, mem->page_size) {
+		*phy_addr = rdma_block_iter_dma_address(&biter);
+		phy_addr++;
+	}
+
+	if (mem->mtt_type == ERDMA_MR_INDIRECT_MTT) {
+		mem->mtt_entry[0] =
+			dma_map_single(&dev->pdev->dev, mem->mtt_buf,
+				       MTT_SIZE(mem->page_cnt), DMA_TO_DEVICE);
+		if (dma_mapping_error(&dev->pdev->dev, mem->mtt_entry[0])) {
+			free_pages_exact(mem->mtt_buf, MTT_SIZE(mem->page_cnt));
+			mem->mtt_buf = NULL;
+			ret = -ENOMEM;
+			goto error_ret;
+		}
+	}
+
+	return 0;
+
+error_ret:
+	if (mem->umem) {
+		ib_umem_release(mem->umem);
+		mem->umem = NULL;
+	}
+
+	return ret;
+}
+
+static void put_mtt_entries(struct erdma_dev *dev, struct erdma_mem *mem)
+{
+	if (mem->mtt_buf) {
+		dma_unmap_single(&dev->pdev->dev, mem->mtt_entry[0],
+				 MTT_SIZE(mem->page_cnt), DMA_TO_DEVICE);
+		free_pages_exact(mem->mtt_buf, MTT_SIZE(mem->page_cnt));
+	}
+
+	if (mem->umem) {
+		ib_umem_release(mem->umem);
+		mem->umem = NULL;
+	}
+}
+
+static int erdma_map_user_dbrecords(struct erdma_ucontext *ctx,
+				    u64 dbrecords_va,
+				    struct erdma_user_dbrecords_page **dbr_page,
+				    dma_addr_t *dma_addr)
+{
+	struct erdma_user_dbrecords_page *page = NULL;
+	int rv = 0;
+
+	mutex_lock(&ctx->dbrecords_page_mutex);
+
+	list_for_each_entry(page, &ctx->dbrecords_page_list, list)
+		if (page->va == (dbrecords_va & PAGE_MASK))
+			goto found;
+
+	page = kmalloc(sizeof(*page), GFP_KERNEL);
+	if (!page) {
+		rv = -ENOMEM;
+		goto out;
+	}
+
+	page->va = (dbrecords_va & PAGE_MASK);
+	page->refcnt = 0;
+
+	page->umem = ib_umem_get(ctx->ibucontext.device,
+				 dbrecords_va & PAGE_MASK, PAGE_SIZE, 0);
+	if (IS_ERR(page->umem)) {
+		rv = PTR_ERR(page->umem);
+		kfree(page);
+		goto out;
+	}
+
+	list_add(&page->list, &ctx->dbrecords_page_list);
+
+found:
+	*dma_addr = sg_dma_address(page->umem->sgt_append.sgt.sgl) +
+		    (dbrecords_va & ~PAGE_MASK);
+	*dbr_page = page;
+	page->refcnt++;
+
+out:
+	mutex_unlock(&ctx->dbrecords_page_mutex);
+	return rv;
+}
+
+static void
+erdma_unmap_user_dbrecords(struct erdma_ucontext *ctx,
+			   struct erdma_user_dbrecords_page **dbr_page)
+{
+	if (!ctx || !(*dbr_page))
+		return;
+
+	mutex_lock(&ctx->dbrecords_page_mutex);
+	if (--(*dbr_page)->refcnt == 0) {
+		list_del(&(*dbr_page)->list);
+		ib_umem_release((*dbr_page)->umem);
+		kfree(*dbr_page);
+	}
+
+	*dbr_page = NULL;
+	mutex_unlock(&ctx->dbrecords_page_mutex);
+}
+
+static int init_user_qp(struct erdma_qp *qp, struct erdma_ucontext *uctx,
+			u64 va, u32 len, u64 db_info_va)
+{
+	dma_addr_t db_info_dma_addr;
+	u32 rq_offset;
+	int ret;
+
+	if (len < (PAGE_ALIGN(qp->attrs.sq_size * SQEBB_SIZE) +
+		   qp->attrs.rq_size * RQE_SIZE))
+		return -EINVAL;
+
+	ret = get_mtt_entries(qp->dev, &qp->user_qp.sq_mtt, va,
+			      qp->attrs.sq_size << SQEBB_SHIFT, 0, va,
+			      (SZ_1M - SZ_4K), 1);
+	if (ret)
+		return ret;
+
+	rq_offset = PAGE_ALIGN(qp->attrs.sq_size << SQEBB_SHIFT);
+	qp->user_qp.rq_offset = rq_offset;
+
+	ret = get_mtt_entries(qp->dev, &qp->user_qp.rq_mtt, va + rq_offset,
+			      qp->attrs.rq_size << RQE_SHIFT, 0, va + rq_offset,
+			      (SZ_1M - SZ_4K), 1);
+	if (ret)
+		goto put_sq_mtt;
+
+	ret = erdma_map_user_dbrecords(uctx, db_info_va,
+				       &qp->user_qp.user_dbr_page,
+				       &db_info_dma_addr);
+	if (ret)
+		goto put_rq_mtt;
+
+	qp->user_qp.sq_db_info_dma_addr = db_info_dma_addr;
+	qp->user_qp.rq_db_info_dma_addr = db_info_dma_addr + ERDMA_DB_SIZE;
+
+	return 0;
+
+put_rq_mtt:
+	put_mtt_entries(qp->dev, &qp->user_qp.rq_mtt);
+
+put_sq_mtt:
+	put_mtt_entries(qp->dev, &qp->user_qp.sq_mtt);
+
+	return ret;
+}
+
+static void free_user_qp(struct erdma_qp *qp, struct erdma_ucontext *uctx)
+{
+	put_mtt_entries(qp->dev, &qp->user_qp.sq_mtt);
+	put_mtt_entries(qp->dev, &qp->user_qp.rq_mtt);
+	erdma_unmap_user_dbrecords(uctx, &qp->user_qp.user_dbr_page);
+}
+
+int erdma_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attrs,
+		    struct ib_udata *udata)
+{
+	struct erdma_qp *qp = to_eqp(ibqp);
+	struct erdma_dev *dev = to_edev(ibqp->device);
+	struct erdma_ucontext *uctx = rdma_udata_to_drv_context(
+		udata, struct erdma_ucontext, ibucontext);
+	struct erdma_ureq_create_qp ureq;
+	struct erdma_uresp_create_qp uresp;
+	int ret;
+
+	ret = erdma_qp_validate_cap(dev, attrs);
+	if (ret)
+		goto err_out;
+
+	ret = erdma_qp_validate_attr(dev, attrs);
+	if (ret)
+		goto err_out;
+
+	qp->scq = to_ecq(attrs->send_cq);
+	qp->rcq = to_ecq(attrs->recv_cq);
+	qp->dev = dev;
+	qp->attrs.cc = dev->attrs.cc;
+
+	init_rwsem(&qp->state_lock);
+	kref_init(&qp->ref);
+	init_completion(&qp->safe_free);
+
+	ret = xa_alloc_cyclic(&dev->qp_xa, &qp->ibqp.qp_num, qp,
+			      XA_LIMIT(1, dev->attrs.max_qp - 1),
+			      &dev->next_alloc_qpn, GFP_KERNEL);
+	if (ret < 0) {
+		ret = -ENOMEM;
+		goto err_out;
+	}
+
+	qp->attrs.sq_size = roundup_pow_of_two(attrs->cap.max_send_wr *
+					       ERDMA_MAX_WQEBB_PER_SQE);
+	qp->attrs.rq_size = roundup_pow_of_two(attrs->cap.max_recv_wr);
+
+	if (uctx) {
+		ret = ib_copy_from_udata(&ureq, udata,
+					 min(sizeof(ureq), udata->inlen));
+		if (ret)
+			goto err_out_xa;
+
+		ret = init_user_qp(qp, uctx, ureq.qbuf_va, ureq.qbuf_len,
+				   ureq.db_record_va);
+		if (ret)
+			goto err_out_xa;
+
+		memset(&uresp, 0, sizeof(uresp));
+
+		uresp.num_sqe = qp->attrs.sq_size;
+		uresp.num_rqe = qp->attrs.rq_size;
+		uresp.qp_id = QP_ID(qp);
+		uresp.rq_offset = qp->user_qp.rq_offset;
+
+		ret = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
+		if (ret)
+			goto err_out_cmd;
+	} else {
+		init_kernel_qp(dev, qp, attrs);
+	}
+
+	qp->attrs.max_send_sge = attrs->cap.max_send_sge;
+	qp->attrs.max_recv_sge = attrs->cap.max_recv_sge;
+	qp->attrs.state = ERDMA_QP_STATE_IDLE;
+
+	ret = create_qp_cmd(dev, qp);
+	if (ret)
+		goto err_out_cmd;
+
+	spin_lock_init(&qp->lock);
+
+	return 0;
+
+err_out_cmd:
+	if (uctx)
+		free_user_qp(qp, uctx);
+	else
+		free_kernel_qp(qp);
+err_out_xa:
+	xa_erase(&dev->qp_xa, QP_ID(qp));
+err_out:
+	return ret;
+}
+
+static int erdma_create_stag(struct erdma_dev *dev, u32 *stag)
+{
+	int stag_idx;
+
+	stag_idx = erdma_alloc_idx(&dev->res_cb[ERDMA_RES_TYPE_STAG_IDX]);
+	if (stag_idx < 0)
+		return stag_idx;
+
+	/* For now, we always let key field be zero. */
+	*stag = (stag_idx << 8);
+
+	return 0;
+}
+
+struct ib_mr *erdma_get_dma_mr(struct ib_pd *ibpd, int acc)
+{
+	struct erdma_dev *dev = to_edev(ibpd->device);
+	struct erdma_mr *mr;
+	u32 stag;
+	int ret;
+
+	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+	if (!mr)
+		return ERR_PTR(-ENOMEM);
+
+	ret = erdma_create_stag(dev, &stag);
+	if (ret)
+		goto out_free;
+
+	mr->type = ERDMA_MR_TYPE_DMA;
+
+	mr->ibmr.lkey = stag;
+	mr->ibmr.rkey = stag;
+	mr->ibmr.pd = ibpd;
+	mr->access = ERDMA_MR_ACC_LR | to_erdma_access_flags(acc);
+	ret = regmr_cmd(dev, mr);
+	if (ret)
+		goto out_remove_stag;
+
+	return &mr->ibmr;
+
+out_remove_stag:
+	erdma_free_idx(&dev->res_cb[ERDMA_RES_TYPE_STAG_IDX],
+		       mr->ibmr.lkey >> 8);
+
+out_free:
+	kfree(mr);
+
+	return ERR_PTR(ret);
+}
+
+struct ib_mr *erdma_ib_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type,
+				u32 max_num_sg)
+{
+	struct erdma_mr *mr;
+	struct erdma_dev *dev = to_edev(ibpd->device);
+	int ret;
+	u32 stag;
+
+	if (mr_type != IB_MR_TYPE_MEM_REG)
+		return ERR_PTR(-EOPNOTSUPP);
+
+	if (max_num_sg > ERDMA_MR_MAX_MTT_CNT)
+		return ERR_PTR(-EINVAL);
+
+	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+	if (!mr)
+		return ERR_PTR(-ENOMEM);
+
+	ret = erdma_create_stag(dev, &stag);
+	if (ret)
+		goto out_free;
+
+	mr->type = ERDMA_MR_TYPE_FRMR;
+
+	mr->ibmr.lkey = stag;
+	mr->ibmr.rkey = stag;
+	mr->ibmr.pd = ibpd;
+	/* update it in FRMR. */
+	mr->access = ERDMA_MR_ACC_LR | ERDMA_MR_ACC_LW | ERDMA_MR_ACC_RR |
+		     ERDMA_MR_ACC_RW;
+
+	mr->mem.page_size = PAGE_SIZE; /* update it later. */
+	mr->mem.page_cnt = max_num_sg;
+	mr->mem.mtt_type = ERDMA_MR_INDIRECT_MTT;
+	mr->mem.mtt_buf =
+		alloc_pages_exact(MTT_SIZE(mr->mem.page_cnt), GFP_KERNEL);
+	if (!mr->mem.mtt_buf) {
+		ret = -ENOMEM;
+		goto out_remove_stag;
+	}
+
+	mr->mem.mtt_entry[0] =
+		dma_map_single(&dev->pdev->dev, mr->mem.mtt_buf,
+			       MTT_SIZE(mr->mem.page_cnt), DMA_TO_DEVICE);
+	if (dma_mapping_error(&dev->pdev->dev, mr->mem.mtt_entry[0])) {
+		ret = -ENOMEM;
+		goto out_free_mtt;
+	}
+
+	ret = regmr_cmd(dev, mr);
+	if (ret)
+		goto out_dma_unmap;
+
+	return &mr->ibmr;
+
+out_dma_unmap:
+	dma_unmap_single(&dev->pdev->dev, mr->mem.mtt_entry[0],
+			 MTT_SIZE(mr->mem.page_cnt), DMA_TO_DEVICE);
+out_free_mtt:
+	free_pages_exact(mr->mem.mtt_buf, MTT_SIZE(mr->mem.page_cnt));
+
+out_remove_stag:
+	erdma_free_idx(&dev->res_cb[ERDMA_RES_TYPE_STAG_IDX],
+		       mr->ibmr.lkey >> 8);
+
+out_free:
+	kfree(mr);
+
+	return ERR_PTR(ret);
+}
+
+static int erdma_set_page(struct ib_mr *ibmr, u64 addr)
+{
+	struct erdma_mr *mr = to_emr(ibmr);
+
+	if (mr->mem.mtt_nents >= mr->mem.page_cnt)
+		return -1;
+
+	*((u64 *)mr->mem.mtt_buf + mr->mem.mtt_nents) = addr;
+	mr->mem.mtt_nents++;
+
+	return 0;
+}
+
+int erdma_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
+		    unsigned int *sg_offset)
+{
+	struct erdma_mr *mr = to_emr(ibmr);
+	int num;
+
+	mr->mem.mtt_nents = 0;
+
+	num = ib_sg_to_pages(&mr->ibmr, sg, sg_nents, sg_offset,
+			     erdma_set_page);
+
+	return num;
+}
+
+struct ib_mr *erdma_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 len,
+				u64 virt, int access, struct ib_udata *udata)
+{
+	struct erdma_mr *mr = NULL;
+	struct erdma_dev *dev = to_edev(ibpd->device);
+	u32 stag;
+	int ret;
+
+	if (!len || len > dev->attrs.max_mr_size)
+		return ERR_PTR(-EINVAL);
+
+	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+	if (!mr)
+		return ERR_PTR(-ENOMEM);
+
+	ret = get_mtt_entries(dev, &mr->mem, start, len, access, virt,
+			      SZ_2G - SZ_4K, 0);
+	if (ret)
+		goto err_out_free;
+
+	ret = erdma_create_stag(dev, &stag);
+	if (ret)
+		goto err_out_put_mtt;
+
+	mr->ibmr.lkey = mr->ibmr.rkey = stag;
+	mr->ibmr.pd = ibpd;
+	mr->mem.va = virt;
+	mr->mem.len = len;
+	mr->access = ERDMA_MR_ACC_LR | to_erdma_access_flags(access);
+	mr->valid = 1;
+	mr->type = ERDMA_MR_TYPE_NORMAL;
+
+	ret = regmr_cmd(dev, mr);
+	if (ret)
+		goto err_out_mr;
+
+	return &mr->ibmr;
+
+err_out_mr:
+	erdma_free_idx(&dev->res_cb[ERDMA_RES_TYPE_STAG_IDX],
+		       mr->ibmr.lkey >> 8);
+
+err_out_put_mtt:
+	put_mtt_entries(dev, &mr->mem);
+
+err_out_free:
+	kfree(mr);
+
+	return ERR_PTR(ret);
+}
+
+int erdma_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
+{
+	struct erdma_mr *mr;
+	struct erdma_dev *dev = to_edev(ibmr->device);
+	struct erdma_cmdq_dereg_mr_req req;
+	int ret;
+
+	mr = to_emr(ibmr);
+
+	erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA,
+				CMDQ_OPCODE_DEREG_MR);
+
+	req.cfg = FIELD_PREP(ERDMA_CMD_MR_MPT_IDX_MASK, ibmr->lkey >> 8) |
+		  FIELD_PREP(ERDMA_CMD_MR_KEY_MASK, ibmr->lkey & 0xFF);
+
+	ret = erdma_post_cmd_wait(&dev->cmdq, (u64 *)&req, sizeof(req), NULL,
+				  NULL);
+	if (ret)
+		return ret;
+
+	erdma_free_idx(&dev->res_cb[ERDMA_RES_TYPE_STAG_IDX], ibmr->lkey >> 8);
+
+	put_mtt_entries(dev, &mr->mem);
+
+	kfree(mr);
+	return 0;
+}
+
+int erdma_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
+{
+	struct erdma_cq *cq = to_ecq(ibcq);
+	struct erdma_dev *dev = to_edev(ibcq->device);
+	struct erdma_ucontext *ctx = rdma_udata_to_drv_context(
+		udata, struct erdma_ucontext, ibucontext);
+	int err;
+	struct erdma_cmdq_destroy_cq_req req;
+
+	erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA,
+				CMDQ_OPCODE_DESTROY_CQ);
+	req.cqn = cq->cqn;
+
+	err = erdma_post_cmd_wait(&dev->cmdq, (u64 *)&req, sizeof(req), NULL,
+				  NULL);
+	if (err)
+		return err;
+
+	if (rdma_is_kernel_res(&cq->ibcq.res)) {
+		dma_free_coherent(&dev->pdev->dev,
+				  WARPPED_BUFSIZE(cq->depth << CQE_SHIFT),
+				  cq->kern_cq.qbuf, cq->kern_cq.qbuf_dma_addr);
+	} else {
+		erdma_unmap_user_dbrecords(ctx, &cq->user_cq.user_dbr_page);
+		put_mtt_entries(dev, &cq->user_cq.qbuf_mtt);
+	}
+
+	xa_erase(&dev->cq_xa, cq->cqn);
+
+	return 0;
+}
+
+int erdma_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
+{
+	struct erdma_qp *qp = to_eqp(ibqp);
+	struct erdma_dev *dev = to_edev(ibqp->device);
+	struct erdma_ucontext *ctx = rdma_udata_to_drv_context(
+		udata, struct erdma_ucontext, ibucontext);
+	struct erdma_qp_attrs qp_attrs;
+	int err;
+	struct erdma_cmdq_destroy_qp_req req;
+
+	down_write(&qp->state_lock);
+	qp_attrs.state = ERDMA_QP_STATE_ERROR;
+	erdma_modify_qp_internal(qp, &qp_attrs, ERDMA_QP_ATTR_STATE);
+	up_write(&qp->state_lock);
+
+	erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA,
+				CMDQ_OPCODE_DESTROY_QP);
+	req.qpn = QP_ID(qp);
+
+	err = erdma_post_cmd_wait(&dev->cmdq, (u64 *)&req, sizeof(req), NULL,
+				  NULL);
+	if (err)
+		return err;
+
+	erdma_qp_put(qp);
+	wait_for_completion(&qp->safe_free);
+
+	if (rdma_is_kernel_res(&qp->ibqp.res)) {
+		vfree(qp->kern_qp.swr_tbl);
+		vfree(qp->kern_qp.rwr_tbl);
+		dma_free_coherent(
+			&dev->pdev->dev,
+			WARPPED_BUFSIZE(qp->attrs.rq_size << RQE_SHIFT),
+			qp->kern_qp.rq_buf, qp->kern_qp.rq_buf_dma_addr);
+		dma_free_coherent(
+			&dev->pdev->dev,
+			WARPPED_BUFSIZE(qp->attrs.sq_size << SQEBB_SHIFT),
+			qp->kern_qp.sq_buf, qp->kern_qp.sq_buf_dma_addr);
+	} else {
+		put_mtt_entries(dev, &qp->user_qp.sq_mtt);
+		put_mtt_entries(dev, &qp->user_qp.rq_mtt);
+		erdma_unmap_user_dbrecords(ctx, &qp->user_qp.user_dbr_page);
+	}
+
+	if (qp->cep)
+		erdma_cep_put(qp->cep);
+	xa_erase(&dev->qp_xa, QP_ID(qp));
+
+	return 0;
+}
+
+void erdma_qp_get_ref(struct ib_qp *ibqp)
+{
+	erdma_qp_get(to_eqp(ibqp));
+}
+
+void erdma_qp_put_ref(struct ib_qp *ibqp)
+{
+	erdma_qp_put(to_eqp(ibqp));
+}
+
+int erdma_mmap(struct ib_ucontext *ctx, struct vm_area_struct *vma)
+{
+	struct rdma_user_mmap_entry *rdma_entry;
+	struct erdma_user_mmap_entry *entry;
+	pgprot_t prot;
+	int err;
+
+	rdma_entry = rdma_user_mmap_entry_get(ctx, vma);
+	if (!rdma_entry)
+		return -EINVAL;
+
+	entry = to_emmap(rdma_entry);
+
+	switch (entry->mmap_flag) {
+	case ERDMA_MMAP_IO_NC:
+		/* map doorbell. */
+		prot = pgprot_device(vma->vm_page_prot);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	err = rdma_user_mmap_io(ctx, vma, PFN_DOWN(entry->address), PAGE_SIZE,
+				prot, rdma_entry);
+
+	rdma_user_mmap_entry_put(rdma_entry);
+	return err;
+}
+
+void erdma_mmap_free(struct rdma_user_mmap_entry *rdma_entry)
+{
+	struct erdma_user_mmap_entry *entry = to_emmap(rdma_entry);
+
+	kfree(entry);
+}
+
+#define ERDMA_SDB_PAGE 0
+#define ERDMA_SDB_ENTRY 1
+#define ERDMA_SDB_SHARED 2
+
+static void alloc_db_resources(struct erdma_dev *dev,
+			       struct erdma_ucontext *ctx)
+{
+	u32 bitmap_idx;
+	struct erdma_devattr *attrs = &dev->attrs;
+
+	if (attrs->disable_dwqe)
+		goto alloc_normal_db;
+
+	/* Try to alloc independent SDB page. */
+	spin_lock(&dev->db_bitmap_lock);
+	bitmap_idx = find_first_zero_bit(dev->sdb_page, attrs->dwqe_pages);
+	if (bitmap_idx != attrs->dwqe_pages) {
+		set_bit(bitmap_idx, dev->sdb_page);
+		spin_unlock(&dev->db_bitmap_lock);
+
+		ctx->sdb_type = ERDMA_SDB_PAGE;
+		ctx->sdb_idx = bitmap_idx;
+		ctx->sdb_page_idx = bitmap_idx;
+		ctx->sdb = dev->func_bar_addr + ERDMA_BAR_SQDB_SPACE_OFFSET +
+			   (bitmap_idx << PAGE_SHIFT);
+		ctx->sdb_page_off = 0;
+
+		return;
+	}
+
+	bitmap_idx = find_first_zero_bit(dev->sdb_entry, attrs->dwqe_entries);
+	if (bitmap_idx != attrs->dwqe_entries) {
+		set_bit(bitmap_idx, dev->sdb_entry);
+		spin_unlock(&dev->db_bitmap_lock);
+
+		ctx->sdb_type = ERDMA_SDB_ENTRY;
+		ctx->sdb_idx = bitmap_idx;
+		ctx->sdb_page_idx = attrs->dwqe_pages +
+				    bitmap_idx / ERDMA_DWQE_TYPE1_CNT_PER_PAGE;
+		ctx->sdb_page_off = bitmap_idx % ERDMA_DWQE_TYPE1_CNT_PER_PAGE;
+
+		ctx->sdb = dev->func_bar_addr + ERDMA_BAR_SQDB_SPACE_OFFSET +
+			   (ctx->sdb_page_idx << PAGE_SHIFT);
+
+		return;
+	}
+
+	spin_unlock(&dev->db_bitmap_lock);
+
+alloc_normal_db:
+	ctx->sdb_type = ERDMA_SDB_SHARED;
+	ctx->sdb_idx = 0;
+	ctx->sdb_page_idx = ERDMA_SDB_SHARED_PAGE_INDEX;
+	ctx->sdb_page_off = 0;
+
+	ctx->sdb = dev->func_bar_addr + (ctx->sdb_page_idx << PAGE_SHIFT);
+}
+
+static void erdma_uctx_user_mmap_entries_remove(struct erdma_ucontext *uctx)
+{
+	rdma_user_mmap_entry_remove(uctx->sq_db_mmap_entry);
+	rdma_user_mmap_entry_remove(uctx->rq_db_mmap_entry);
+	rdma_user_mmap_entry_remove(uctx->cq_db_mmap_entry);
+}
+
+int erdma_alloc_ucontext(struct ib_ucontext *ibctx, struct ib_udata *udata)
+{
+	struct erdma_ucontext *ctx = to_ectx(ibctx);
+	struct erdma_dev *dev = to_edev(ibctx->device);
+	int ret;
+	struct erdma_uresp_alloc_ctx uresp = {};
+
+	if (atomic_inc_return(&dev->num_ctx) > ERDMA_MAX_CONTEXT) {
+		ret = -ENOMEM;
+		goto err_out;
+	}
+
+	INIT_LIST_HEAD(&ctx->dbrecords_page_list);
+	mutex_init(&ctx->dbrecords_page_mutex);
+
+	alloc_db_resources(dev, ctx);
+
+	ctx->rdb = dev->func_bar_addr + ERDMA_BAR_RQDB_SPACE_OFFSET;
+	ctx->cdb = dev->func_bar_addr + ERDMA_BAR_CQDB_SPACE_OFFSET;
+
+	if (udata->outlen < sizeof(uresp)) {
+		ret = -EINVAL;
+		goto err_out;
+	}
+
+	ctx->sq_db_mmap_entry = erdma_user_mmap_entry_insert(
+		ctx, (void *)ctx->sdb, PAGE_SIZE, ERDMA_MMAP_IO_NC, &uresp.sdb);
+	if (!ctx->sq_db_mmap_entry) {
+		ret = -ENOMEM;
+		goto err_out;
+	}
+
+	ctx->rq_db_mmap_entry = erdma_user_mmap_entry_insert(
+		ctx, (void *)ctx->rdb, PAGE_SIZE, ERDMA_MMAP_IO_NC, &uresp.rdb);
+	if (!ctx->rq_db_mmap_entry) {
+		ret = -EINVAL;
+		goto err_out;
+	}
+
+	ctx->cq_db_mmap_entry = erdma_user_mmap_entry_insert(
+		ctx, (void *)ctx->cdb, PAGE_SIZE, ERDMA_MMAP_IO_NC, &uresp.cdb);
+	if (!ctx->cq_db_mmap_entry) {
+		ret = -EINVAL;
+		goto err_out;
+	}
+
+	uresp.dev_id = dev->pdev->device;
+	uresp.sdb_type = ctx->sdb_type;
+	uresp.sdb_offset = ctx->sdb_page_off;
+
+	ret = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
+	if (ret)
+		goto err_out;
+
+	return 0;
+
+err_out:
+	erdma_uctx_user_mmap_entries_remove(ctx);
+	atomic_dec(&dev->num_ctx);
+	return ret;
+}
+
+void erdma_dealloc_ucontext(struct ib_ucontext *ibctx)
+{
+	struct erdma_ucontext *ctx = to_ectx(ibctx);
+	struct erdma_dev *dev = to_edev(ibctx->device);
+
+	spin_lock(&dev->db_bitmap_lock);
+	if (ctx->sdb_type == ERDMA_SDB_PAGE)
+		clear_bit(ctx->sdb_idx, dev->sdb_page);
+	else if (ctx->sdb_type == ERDMA_SDB_ENTRY)
+		clear_bit(ctx->sdb_idx, dev->sdb_entry);
+
+	erdma_uctx_user_mmap_entries_remove(ctx);
+
+	spin_unlock(&dev->db_bitmap_lock);
+
+	atomic_dec(&dev->num_ctx);
+}
+
+static int ib_qp_state_to_erdma_qp_state[IB_QPS_ERR + 1] = {
+	[IB_QPS_RESET] = ERDMA_QP_STATE_IDLE,
+	[IB_QPS_INIT] = ERDMA_QP_STATE_IDLE,
+	[IB_QPS_RTR] = ERDMA_QP_STATE_RTR,
+	[IB_QPS_RTS] = ERDMA_QP_STATE_RTS,
+	[IB_QPS_SQD] = ERDMA_QP_STATE_CLOSING,
+	[IB_QPS_SQE] = ERDMA_QP_STATE_TERMINATE,
+	[IB_QPS_ERR] = ERDMA_QP_STATE_ERROR
+};
+
+int erdma_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask,
+		    struct ib_udata *udata)
+{
+	struct erdma_qp_attrs new_attrs;
+	enum erdma_qp_attr_mask erdma_attr_mask = 0;
+	struct erdma_qp *qp = to_eqp(ibqp);
+	int ret = 0;
+
+	if (attr_mask & ~IB_QP_ATTR_STANDARD_BITS)
+		return -EOPNOTSUPP;
+
+	memset(&new_attrs, 0, sizeof(new_attrs));
+
+	if (attr_mask & IB_QP_STATE) {
+		new_attrs.state = ib_qp_state_to_erdma_qp_state[attr->qp_state];
+
+		erdma_attr_mask |= ERDMA_QP_ATTR_STATE;
+	}
+
+	down_write(&qp->state_lock);
+
+	ret = erdma_modify_qp_internal(qp, &new_attrs, erdma_attr_mask);
+
+	up_write(&qp->state_lock);
+
+	return ret;
+}
+
+int erdma_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr,
+		   int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr)
+{
+	struct erdma_qp *qp;
+	struct erdma_dev *dev;
+
+	if (ibqp && qp_attr && qp_init_attr) {
+		qp = to_eqp(ibqp);
+		dev = to_edev(ibqp->device);
+	} else {
+		return -EINVAL;
+	}
+
+	qp_attr->cap.max_inline_data = ERDMA_MAX_INLINE;
+	qp_init_attr->cap.max_inline_data = ERDMA_MAX_INLINE;
+
+	qp_attr->cap.max_send_wr = qp->attrs.sq_size;
+	qp_attr->cap.max_recv_wr = qp->attrs.rq_size;
+	qp_attr->cap.max_send_sge = qp->attrs.max_send_sge;
+	qp_attr->cap.max_recv_sge = qp->attrs.max_recv_sge;
+
+	qp_attr->path_mtu = ib_mtu_int_to_enum(dev->netdev->mtu);
+	qp_attr->max_rd_atomic = qp->attrs.irq_size;
+	qp_attr->max_dest_rd_atomic = qp->attrs.orq_size;
+
+	qp_attr->qp_access_flags = IB_ACCESS_LOCAL_WRITE |
+				   IB_ACCESS_REMOTE_WRITE |
+				   IB_ACCESS_REMOTE_READ;
+
+	qp_init_attr->cap = qp_attr->cap;
+
+	return 0;
+}
+
+static int erdma_init_user_cq(struct erdma_ucontext *ctx, struct erdma_cq *cq,
+			      struct erdma_ureq_create_cq *ureq)
+{
+	int ret;
+	struct erdma_dev *dev = to_edev(cq->ibcq.device);
+
+	ret = get_mtt_entries(dev, &cq->user_cq.qbuf_mtt, ureq->qbuf_va,
+			      ureq->qbuf_len, 0, ureq->qbuf_va, SZ_64M - SZ_4K,
+			      1);
+	if (ret)
+		return ret;
+
+	ret = erdma_map_user_dbrecords(ctx, ureq->db_record_va,
+				       &cq->user_cq.user_dbr_page,
+				       &cq->user_cq.db_info_dma_addr);
+	if (ret)
+		put_mtt_entries(dev, &cq->user_cq.qbuf_mtt);
+
+	return ret;
+}
+
+static int erdma_init_kernel_cq(struct erdma_cq *cq)
+{
+	struct erdma_dev *dev = to_edev(cq->ibcq.device);
+
+	cq->kern_cq.qbuf =
+		dma_alloc_coherent(&dev->pdev->dev,
+				   WARPPED_BUFSIZE(cq->depth << CQE_SHIFT),
+				   &cq->kern_cq.qbuf_dma_addr, GFP_KERNEL);
+	if (!cq->kern_cq.qbuf)
+		return -ENOMEM;
+
+	cq->kern_cq.db_record =
+		(u64 *)(cq->kern_cq.qbuf + (cq->depth << CQE_SHIFT));
+	spin_lock_init(&cq->kern_cq.lock);
+	/* use default cqdb addr */
+	cq->kern_cq.db = dev->func_bar + ERDMA_BAR_CQDB_SPACE_OFFSET;
+
+	return 0;
+}
+
+int erdma_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+		    struct ib_udata *udata)
+{
+	struct erdma_cq *cq = to_ecq(ibcq);
+	struct erdma_dev *dev = to_edev(ibcq->device);
+	unsigned int depth = attr->cqe;
+	int ret;
+	struct erdma_ucontext *ctx = rdma_udata_to_drv_context(
+		udata, struct erdma_ucontext, ibucontext);
+
+	if (depth > dev->attrs.max_cqe)
+		return -EINVAL;
+
+	depth = roundup_pow_of_two(depth);
+	cq->ibcq.cqe = depth;
+	cq->depth = depth;
+	cq->assoc_eqn = attr->comp_vector + 1;
+
+	ret = xa_alloc_cyclic(&dev->cq_xa, &cq->cqn, cq,
+			      XA_LIMIT(1, dev->attrs.max_cq - 1),
+			      &dev->next_alloc_cqn, GFP_KERNEL);
+	if (ret < 0)
+		return ret;
+
+	if (!rdma_is_kernel_res(&ibcq->res)) {
+		struct erdma_ureq_create_cq ureq;
+		struct erdma_uresp_create_cq uresp;
+
+		ret = ib_copy_from_udata(&ureq, udata,
+					 min(udata->inlen, sizeof(ureq)));
+		if (ret)
+			goto err_out_xa;
+
+		ret = erdma_init_user_cq(ctx, cq, &ureq);
+		if (ret)
+			goto err_out_xa;
+
+		uresp.cq_id = cq->cqn;
+		uresp.num_cqe = depth;
+
+		ret = ib_copy_to_udata(udata, &uresp,
+				       min(sizeof(uresp), udata->outlen));
+		if (ret)
+			goto err_free_res;
+	} else {
+		ret = erdma_init_kernel_cq(cq);
+		if (ret)
+			goto err_out_xa;
+	}
+
+	ret = create_cq_cmd(dev, cq);
+	if (ret)
+		goto err_free_res;
+
+	return 0;
+
+err_free_res:
+	if (!rdma_is_kernel_res(&ibcq->res)) {
+		erdma_unmap_user_dbrecords(ctx, &cq->user_cq.user_dbr_page);
+		put_mtt_entries(dev, &cq->user_cq.qbuf_mtt);
+	} else {
+		dma_free_coherent(&dev->pdev->dev,
+				  WARPPED_BUFSIZE(depth << CQE_SHIFT),
+				  cq->kern_cq.qbuf, cq->kern_cq.qbuf_dma_addr);
+	}
+
+err_out_xa:
+	xa_erase(&dev->cq_xa, cq->cqn);
+
+	return ret;
+}
+
+void erdma_port_event(struct erdma_dev *dev, enum ib_event_type reason)
+{
+	struct ib_event event;
+
+	event.device = &dev->ibdev;
+	event.element.port_num = 1;
+	event.event = reason;
+
+	ib_dispatch_event(&event);
+}
diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.h b/drivers/infiniband/hw/erdma/erdma_verbs.h
new file mode 100644
index 000000000000..c7baddb1f292
--- /dev/null
+++ b/drivers/infiniband/hw/erdma/erdma_verbs.h
@@ -0,0 +1,342 @@
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
+
+/* Authors: Cheng Xu <chengyou@linux.alibaba.com> */
+/*          Kai Shen <kaishen@linux.alibaba.com> */
+/* Copyright (c) 2020-2022, Alibaba Group. */
+
+#ifndef __ERDMA_VERBS_H__
+#define __ERDMA_VERBS_H__
+
+#include <linux/errno.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_user_verbs.h>
+#include <rdma/iw_cm.h>
+
+#include "erdma.h"
+#include "erdma_cm.h"
+#include "erdma_hw.h"
+
+/* RDMA Capability. */
+#define ERDMA_MAX_PD (128 * 1024)
+#define ERDMA_MAX_SEND_WR 4096
+#define ERDMA_MAX_ORD 128
+#define ERDMA_MAX_IRD 128
+#define ERDMA_MAX_SGE_RD 1
+#define ERDMA_MAX_CONTEXT (128 * 1024)
+#define ERDMA_MAX_SEND_SGE 6
+#define ERDMA_MAX_RECV_SGE 1
+#define ERDMA_MAX_INLINE (sizeof(struct erdma_sge) * (ERDMA_MAX_SEND_SGE))
+#define ERDMA_MAX_FRMR_PA 512
+
+enum {
+	ERDMA_MMAP_IO_NC = 0, /* no cache */
+};
+
+struct erdma_user_mmap_entry {
+	struct rdma_user_mmap_entry rdma_entry;
+	u64 address;
+	u8 mmap_flag;
+};
+
+struct erdma_ucontext {
+	struct ib_ucontext ibucontext;
+
+	u32 sdb_type;
+	u32 sdb_idx;
+	u32 sdb_page_idx;
+	u32 sdb_page_off;
+	u64 sdb;
+	u64 rdb;
+	u64 cdb;
+
+	struct rdma_user_mmap_entry *sq_db_mmap_entry;
+	struct rdma_user_mmap_entry *rq_db_mmap_entry;
+	struct rdma_user_mmap_entry *cq_db_mmap_entry;
+
+	/* doorbell records */
+	struct list_head dbrecords_page_list;
+	struct mutex dbrecords_page_mutex;
+};
+
+struct erdma_pd {
+	struct ib_pd ibpd;
+	u32 pdn;
+};
+
+/*
+ * MemoryRegion definition.
+ */
+#define ERDMA_MAX_INLINE_MTT_ENTRIES 4
+#define MTT_SIZE(mtt_cnt) (mtt_cnt << 3) /* per mtt takes 8 Bytes. */
+#define ERDMA_MR_MAX_MTT_CNT 524288
+#define ERDMA_MTT_ENTRY_SIZE 8
+
+#define ERDMA_MR_TYPE_NORMAL 0
+#define ERDMA_MR_TYPE_FRMR 1
+#define ERDMA_MR_TYPE_DMA 2
+
+#define ERDMA_MR_INLINE_MTT 0
+#define ERDMA_MR_INDIRECT_MTT 1
+
+#define ERDMA_MR_ACC_LR BIT(0)
+#define ERDMA_MR_ACC_LW BIT(1)
+#define ERDMA_MR_ACC_RR BIT(2)
+#define ERDMA_MR_ACC_RW BIT(3)
+
+static inline u8 to_erdma_access_flags(int access)
+{
+	return (access & IB_ACCESS_REMOTE_READ ? ERDMA_MR_ACC_RR : 0) |
+	       (access & IB_ACCESS_LOCAL_WRITE ? ERDMA_MR_ACC_LW : 0) |
+	       (access & IB_ACCESS_REMOTE_WRITE ? ERDMA_MR_ACC_RW : 0);
+}
+
+struct erdma_mem {
+	struct ib_umem *umem;
+	void *mtt_buf;
+	u32 mtt_type;
+	u32 page_size;
+	u32 page_offset;
+	u32 page_cnt;
+	u32 mtt_nents;
+
+	u64 va;
+	u64 len;
+
+	u64 mtt_entry[ERDMA_MAX_INLINE_MTT_ENTRIES];
+};
+
+struct erdma_mr {
+	struct ib_mr ibmr;
+	struct erdma_mem mem;
+	u8 type;
+	u8 access;
+	u8 valid;
+};
+
+struct erdma_user_dbrecords_page {
+	struct list_head list;
+	struct ib_umem *umem;
+	u64 va;
+	int refcnt;
+};
+
+struct erdma_uqp {
+	struct erdma_mem sq_mtt;
+	struct erdma_mem rq_mtt;
+
+	dma_addr_t sq_db_info_dma_addr;
+	dma_addr_t rq_db_info_dma_addr;
+
+	struct erdma_user_dbrecords_page *user_dbr_page;
+
+	u32 rq_offset;
+};
+
+struct erdma_kqp {
+	u16 sq_pi;
+	u16 sq_ci;
+
+	u16 rq_pi;
+	u16 rq_ci;
+
+	u64 *swr_tbl;
+	u64 *rwr_tbl;
+
+	void __iomem *hw_sq_db;
+	void __iomem *hw_rq_db;
+
+	void *sq_buf;
+	dma_addr_t sq_buf_dma_addr;
+
+	void *rq_buf;
+	dma_addr_t rq_buf_dma_addr;
+
+	void *sq_db_info;
+	void *rq_db_info;
+
+	u8 sig_all;
+};
+
+enum erdma_qp_state {
+	ERDMA_QP_STATE_IDLE = 0,
+	ERDMA_QP_STATE_RTR = 1,
+	ERDMA_QP_STATE_RTS = 2,
+	ERDMA_QP_STATE_CLOSING = 3,
+	ERDMA_QP_STATE_TERMINATE = 4,
+	ERDMA_QP_STATE_ERROR = 5,
+	ERDMA_QP_STATE_UNDEF = 7,
+	ERDMA_QP_STATE_COUNT = 8
+};
+
+enum erdma_qp_attr_mask {
+	ERDMA_QP_ATTR_STATE = (1 << 0),
+	ERDMA_QP_ATTR_LLP_HANDLE = (1 << 2),
+	ERDMA_QP_ATTR_ORD = (1 << 3),
+	ERDMA_QP_ATTR_IRD = (1 << 4),
+	ERDMA_QP_ATTR_SQ_SIZE = (1 << 5),
+	ERDMA_QP_ATTR_RQ_SIZE = (1 << 6),
+	ERDMA_QP_ATTR_MPA = (1 << 7)
+};
+
+struct erdma_qp_attrs {
+	enum erdma_qp_state state;
+	enum erdma_cc_alg cc; /* Congestion control algorithm */
+	u32 sq_size;
+	u32 rq_size;
+	u32 orq_size;
+	u32 irq_size;
+	u32 max_send_sge;
+	u32 max_recv_sge;
+	u32 cookie;
+#define ERDMA_QP_ACTIVE 0
+#define ERDMA_QP_PASSIVE 1
+	u8 qp_type;
+	u8 pd_len;
+};
+
+struct erdma_qp {
+	struct ib_qp ibqp;
+	struct kref ref;
+	struct completion safe_free;
+	struct erdma_dev *dev;
+	struct erdma_cep *cep;
+	struct rw_semaphore state_lock;
+
+	union {
+		struct erdma_kqp kern_qp;
+		struct erdma_uqp user_qp;
+	};
+
+	struct erdma_cq *scq;
+	struct erdma_cq *rcq;
+
+	struct erdma_qp_attrs attrs;
+	spinlock_t lock;
+};
+
+struct erdma_kcq_info {
+	void *qbuf;
+	dma_addr_t qbuf_dma_addr;
+	u32 ci;
+	u32 cmdsn;
+	u32 notify_cnt;
+
+	spinlock_t lock;
+	u8 __iomem *db;
+	u64 *db_record;
+};
+
+struct erdma_ucq_info {
+	struct erdma_mem qbuf_mtt;
+	struct erdma_user_dbrecords_page *user_dbr_page;
+	dma_addr_t db_info_dma_addr;
+};
+
+struct erdma_cq {
+	struct ib_cq ibcq;
+	u32 cqn;
+
+	u32 depth;
+	u32 assoc_eqn;
+
+	union {
+		struct erdma_kcq_info kern_cq;
+		struct erdma_ucq_info user_cq;
+	};
+};
+
+#define QP_ID(qp) ((qp)->ibqp.qp_num)
+
+static inline struct erdma_qp *find_qp_by_qpn(struct erdma_dev *dev, int id)
+{
+	return (struct erdma_qp *)xa_load(&dev->qp_xa, id);
+}
+
+static inline struct erdma_cq *find_cq_by_cqn(struct erdma_dev *dev, int id)
+{
+	return (struct erdma_cq *)xa_load(&dev->cq_xa, id);
+}
+
+void erdma_qp_get(struct erdma_qp *qp);
+void erdma_qp_put(struct erdma_qp *qp);
+int erdma_modify_qp_internal(struct erdma_qp *qp, struct erdma_qp_attrs *attrs,
+			     enum erdma_qp_attr_mask mask);
+void erdma_qp_llp_close(struct erdma_qp *qp);
+void erdma_qp_cm_drop(struct erdma_qp *qp);
+
+static inline struct erdma_ucontext *to_ectx(struct ib_ucontext *ibctx)
+{
+	return container_of(ibctx, struct erdma_ucontext, ibucontext);
+}
+
+static inline struct erdma_pd *to_epd(struct ib_pd *pd)
+{
+	return container_of(pd, struct erdma_pd, ibpd);
+}
+
+static inline struct erdma_mr *to_emr(struct ib_mr *ibmr)
+{
+	return container_of(ibmr, struct erdma_mr, ibmr);
+}
+
+static inline struct erdma_qp *to_eqp(struct ib_qp *qp)
+{
+	return container_of(qp, struct erdma_qp, ibqp);
+}
+
+static inline struct erdma_cq *to_ecq(struct ib_cq *ibcq)
+{
+	return container_of(ibcq, struct erdma_cq, ibcq);
+}
+
+static inline struct erdma_user_mmap_entry *
+to_emmap(struct rdma_user_mmap_entry *ibmmap)
+{
+	return container_of(ibmmap, struct erdma_user_mmap_entry, rdma_entry);
+}
+
+int erdma_alloc_ucontext(struct ib_ucontext *ibctx, struct ib_udata *data);
+void erdma_dealloc_ucontext(struct ib_ucontext *ibctx);
+int erdma_query_device(struct ib_device *dev, struct ib_device_attr *attr,
+		       struct ib_udata *data);
+int erdma_get_port_immutable(struct ib_device *dev, u32 port,
+			     struct ib_port_immutable *ib_port_immutable);
+int erdma_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+		    struct ib_udata *data);
+int erdma_query_port(struct ib_device *dev, u32 port,
+		     struct ib_port_attr *attr);
+int erdma_query_gid(struct ib_device *dev, u32 port, int idx,
+		    union ib_gid *gid);
+int erdma_alloc_pd(struct ib_pd *ibpd, struct ib_udata *data);
+int erdma_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata);
+int erdma_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attr,
+		    struct ib_udata *data);
+int erdma_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int mask,
+		   struct ib_qp_init_attr *init_attr);
+int erdma_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int mask,
+		    struct ib_udata *data);
+int erdma_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata);
+int erdma_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata);
+int erdma_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags);
+struct ib_mr *erdma_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 len,
+				u64 virt, int access, struct ib_udata *udata);
+struct ib_mr *erdma_get_dma_mr(struct ib_pd *ibpd, int rights);
+int erdma_dereg_mr(struct ib_mr *ibmr, struct ib_udata *data);
+int erdma_mmap(struct ib_ucontext *ctx, struct vm_area_struct *vma);
+void erdma_mmap_free(struct rdma_user_mmap_entry *rdma_entry);
+void erdma_qp_get_ref(struct ib_qp *ibqp);
+void erdma_qp_put_ref(struct ib_qp *ibqp);
+struct ib_qp *erdma_get_ibqp(struct ib_device *dev, int id);
+int erdma_post_send(struct ib_qp *ibqp, const struct ib_send_wr *send_wr,
+		    const struct ib_send_wr **bad_send_wr);
+int erdma_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *recv_wr,
+		    const struct ib_recv_wr **bad_recv_wr);
+int erdma_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc);
+struct ib_mr *erdma_ib_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type,
+				u32 max_num_sg);
+int erdma_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
+		    unsigned int *sg_offset);
+void erdma_port_event(struct erdma_dev *dev, enum ib_event_type reason);
+
+#endif
diff --git a/drivers/infiniband/hw/hfi1/Kconfig b/drivers/infiniband/hw/hfi1/Kconfig
index 6eb739052121..14b92e12bf29 100644
--- a/drivers/infiniband/hw/hfi1/Kconfig
+++ b/drivers/infiniband/hw/hfi1/Kconfig
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0-only
 config INFINIBAND_HFI1
 	tristate "Cornelis OPX Gen1 support"
-	depends on X86_64 && INFINIBAND_RDMAVT && I2C
+	depends on X86_64 && INFINIBAND_RDMAVT && I2C && !UML
 	select MMU_NOTIFIER
 	select CRC32
 	select I2C_ALGOBIT
diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c
index 2e4cf2b11653..629beff053ad 100644
--- a/drivers/infiniband/hw/hfi1/file_ops.c
+++ b/drivers/infiniband/hw/hfi1/file_ops.c
@@ -1179,8 +1179,10 @@ static int setup_base_ctxt(struct hfi1_filedata *fd,
 		goto done;
 
 	ret = init_user_ctxt(fd, uctxt);
-	if (ret)
+	if (ret) {
+		hfi1_free_ctxt_rcv_groups(uctxt);
 		goto done;
+	}
 
 	user_init(uctxt);
 
diff --git a/drivers/infiniband/hw/hfi1/ipoib_tx.c b/drivers/infiniband/hw/hfi1/ipoib_tx.c
index d6bbdb8fcb50..5d9a7b09ca37 100644
--- a/drivers/infiniband/hw/hfi1/ipoib_tx.c
+++ b/drivers/infiniband/hw/hfi1/ipoib_tx.c
@@ -742,9 +742,7 @@ int hfi1_ipoib_txreq_init(struct hfi1_ipoib_dev_priv *priv)
 				kzalloc_node(sizeof(*tx->sdma_hdr),
 					     GFP_KERNEL, priv->dd->node);
 
-		netif_tx_napi_add(dev, &txq->napi,
-				  hfi1_ipoib_poll_tx_ring,
-				  NAPI_POLL_WEIGHT);
+		netif_napi_add_tx(dev, &txq->napi, hfi1_ipoib_poll_tx_ring);
 	}
 
 	return 0;
diff --git a/drivers/infiniband/hw/hfi1/netdev_rx.c b/drivers/infiniband/hw/hfi1/netdev_rx.c
index 03b098a494b5..3dfa5aff2512 100644
--- a/drivers/infiniband/hw/hfi1/netdev_rx.c
+++ b/drivers/infiniband/hw/hfi1/netdev_rx.c
@@ -216,7 +216,7 @@ static int hfi1_netdev_rxq_init(struct hfi1_netdev_rx *rx)
 		 * right now.
 		 */
 		set_bit(NAPI_STATE_NO_BUSY_POLL, &rxq->napi.state);
-		netif_napi_add(dev, &rxq->napi, hfi1_netdev_rx_napi, 64);
+		netif_napi_add_weight(dev, &rxq->napi, hfi1_netdev_rx_napi, 64);
 		rc = msix_netdev_request_rcd_irq(rxq->rcd);
 		if (rc)
 			goto bail_context_irq_failure;
diff --git a/drivers/infiniband/hw/hfi1/pio_copy.c b/drivers/infiniband/hw/hfi1/pio_copy.c
index 136f9a99e1e0..7690f996d5e3 100644
--- a/drivers/infiniband/hw/hfi1/pio_copy.c
+++ b/drivers/infiniband/hw/hfi1/pio_copy.c
@@ -172,7 +172,7 @@ static inline void jcopy(u8 *dest, const u8 *src, u32 n)
 }
 
 /*
- * Read nbytes from "from" and and place them in the low bytes
+ * Read nbytes from "from" and place them in the low bytes
  * of pbuf->carry.  Other bytes are left as-is.  Any previous
  * value in pbuf->carry is lost.
  *
diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h
index 2855e9ad4b32..f848eedc6a23 100644
--- a/drivers/infiniband/hw/hns/hns_roce_device.h
+++ b/drivers/infiniband/hw/hns/hns_roce_device.h
@@ -959,6 +959,7 @@ struct hns_roce_dev {
 	const struct hns_roce_hw *hw;
 	void			*priv;
 	struct workqueue_struct *irq_workq;
+	struct work_struct ecc_work;
 	const struct hns_roce_dfx_hw *dfx;
 	u32 func_num;
 	u32 is_vf;
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index ba3c742258ef..cbdafaac678a 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -55,6 +55,42 @@ enum {
 	CMD_RST_PRC_EBUSY,
 };
 
+enum ecc_resource_type {
+	ECC_RESOURCE_QPC,
+	ECC_RESOURCE_CQC,
+	ECC_RESOURCE_MPT,
+	ECC_RESOURCE_SRQC,
+	ECC_RESOURCE_GMV,
+	ECC_RESOURCE_QPC_TIMER,
+	ECC_RESOURCE_CQC_TIMER,
+	ECC_RESOURCE_SCCC,
+	ECC_RESOURCE_COUNT,
+};
+
+static const struct {
+	const char *name;
+	u8 read_bt0_op;
+	u8 write_bt0_op;
+} fmea_ram_res[] = {
+	{ "ECC_RESOURCE_QPC",
+	  HNS_ROCE_CMD_READ_QPC_BT0, HNS_ROCE_CMD_WRITE_QPC_BT0 },
+	{ "ECC_RESOURCE_CQC",
+	  HNS_ROCE_CMD_READ_CQC_BT0, HNS_ROCE_CMD_WRITE_CQC_BT0 },
+	{ "ECC_RESOURCE_MPT",
+	  HNS_ROCE_CMD_READ_MPT_BT0, HNS_ROCE_CMD_WRITE_MPT_BT0 },
+	{ "ECC_RESOURCE_SRQC",
+	  HNS_ROCE_CMD_READ_SRQC_BT0, HNS_ROCE_CMD_WRITE_SRQC_BT0 },
+	/* ECC_RESOURCE_GMV is handled by cmdq, not mailbox */
+	{ "ECC_RESOURCE_GMV",
+	  0, 0 },
+	{ "ECC_RESOURCE_QPC_TIMER",
+	  HNS_ROCE_CMD_READ_QPC_TIMER_BT0, HNS_ROCE_CMD_WRITE_QPC_TIMER_BT0 },
+	{ "ECC_RESOURCE_CQC_TIMER",
+	  HNS_ROCE_CMD_READ_CQC_TIMER_BT0, HNS_ROCE_CMD_WRITE_CQC_TIMER_BT0 },
+	{ "ECC_RESOURCE_SCCC",
+	  HNS_ROCE_CMD_READ_SCCC_BT0, HNS_ROCE_CMD_WRITE_SCCC_BT0 },
+};
+
 static inline void set_data_seg_v2(struct hns_roce_v2_wqe_data_seg *dseg,
 				   struct ib_sge *sg)
 {
@@ -5855,12 +5891,12 @@ static struct hns_roce_aeqe *next_aeqe_sw_v2(struct hns_roce_eq *eq)
 		!!(eq->cons_index & eq->entries)) ? aeqe : NULL;
 }
 
-static int hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev,
-			       struct hns_roce_eq *eq)
+static irqreturn_t hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev,
+				       struct hns_roce_eq *eq)
 {
 	struct device *dev = hr_dev->dev;
 	struct hns_roce_aeqe *aeqe = next_aeqe_sw_v2(eq);
-	int aeqe_found = 0;
+	irqreturn_t aeqe_found = IRQ_NONE;
 	int event_type;
 	u32 queue_num;
 	int sub_type;
@@ -5914,7 +5950,7 @@ static int hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev,
 		eq->event_type = event_type;
 		eq->sub_type = sub_type;
 		++eq->cons_index;
-		aeqe_found = 1;
+		aeqe_found = IRQ_HANDLED;
 
 		hns_roce_v2_init_irq_work(hr_dev, eq, queue_num);
 
@@ -5922,7 +5958,8 @@ static int hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev,
 	}
 
 	update_eq_db(eq);
-	return aeqe_found;
+
+	return IRQ_RETVAL(aeqe_found);
 }
 
 static struct hns_roce_ceqe *next_ceqe_sw_v2(struct hns_roce_eq *eq)
@@ -5937,11 +5974,11 @@ static struct hns_roce_ceqe *next_ceqe_sw_v2(struct hns_roce_eq *eq)
 		!!(eq->cons_index & eq->entries)) ? ceqe : NULL;
 }
 
-static int hns_roce_v2_ceq_int(struct hns_roce_dev *hr_dev,
-			       struct hns_roce_eq *eq)
+static irqreturn_t hns_roce_v2_ceq_int(struct hns_roce_dev *hr_dev,
+				       struct hns_roce_eq *eq)
 {
 	struct hns_roce_ceqe *ceqe = next_ceqe_sw_v2(eq);
-	int ceqe_found = 0;
+	irqreturn_t ceqe_found = IRQ_NONE;
 	u32 cqn;
 
 	while (ceqe) {
@@ -5955,21 +5992,21 @@ static int hns_roce_v2_ceq_int(struct hns_roce_dev *hr_dev,
 		hns_roce_cq_completion(hr_dev, cqn);
 
 		++eq->cons_index;
-		ceqe_found = 1;
+		ceqe_found = IRQ_HANDLED;
 
 		ceqe = next_ceqe_sw_v2(eq);
 	}
 
 	update_eq_db(eq);
 
-	return ceqe_found;
+	return IRQ_RETVAL(ceqe_found);
 }
 
 static irqreturn_t hns_roce_v2_msix_interrupt_eq(int irq, void *eq_ptr)
 {
 	struct hns_roce_eq *eq = eq_ptr;
 	struct hns_roce_dev *hr_dev = eq->hr_dev;
-	int int_work;
+	irqreturn_t int_work;
 
 	if (eq->type_flag == HNS_ROCE_CEQ)
 		/* Completion event interrupt */
@@ -5981,27 +6018,22 @@ static irqreturn_t hns_roce_v2_msix_interrupt_eq(int irq, void *eq_ptr)
 	return IRQ_RETVAL(int_work);
 }
 
-static irqreturn_t hns_roce_v2_msix_interrupt_abn(int irq, void *dev_id)
+static irqreturn_t abnormal_interrupt_basic(struct hns_roce_dev *hr_dev,
+					    u32 int_st)
 {
-	struct hns_roce_dev *hr_dev = dev_id;
-	struct device *dev = hr_dev->dev;
-	int int_work = 0;
-	u32 int_st;
+	struct pci_dev *pdev = hr_dev->pci_dev;
+	struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev);
+	const struct hnae3_ae_ops *ops = ae_dev->ops;
+	irqreturn_t int_work = IRQ_NONE;
 	u32 int_en;
 
-	/* Abnormal interrupt */
-	int_st = roce_read(hr_dev, ROCEE_VF_ABN_INT_ST_REG);
 	int_en = roce_read(hr_dev, ROCEE_VF_ABN_INT_EN_REG);
 
 	if (int_st & BIT(HNS_ROCE_V2_VF_INT_ST_AEQ_OVERFLOW_S)) {
-		struct pci_dev *pdev = hr_dev->pci_dev;
-		struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev);
-		const struct hnae3_ae_ops *ops = ae_dev->ops;
+		dev_err(hr_dev->dev, "AEQ overflow!\n");
 
-		dev_err(dev, "AEQ overflow!\n");
-
-		int_st |= 1 << HNS_ROCE_V2_VF_INT_ST_AEQ_OVERFLOW_S;
-		roce_write(hr_dev, ROCEE_VF_ABN_INT_ST_REG, int_st);
+		roce_write(hr_dev, ROCEE_VF_ABN_INT_ST_REG,
+			   1 << HNS_ROCE_V2_VF_INT_ST_AEQ_OVERFLOW_S);
 
 		/* Set reset level for reset_event() */
 		if (ops->set_default_reset_request)
@@ -6013,19 +6045,165 @@ static irqreturn_t hns_roce_v2_msix_interrupt_abn(int irq, void *dev_id)
 		int_en |= 1 << HNS_ROCE_V2_VF_ABN_INT_EN_S;
 		roce_write(hr_dev, ROCEE_VF_ABN_INT_EN_REG, int_en);
 
-		int_work = 1;
-	} else if (int_st & BIT(HNS_ROCE_V2_VF_INT_ST_RAS_INT_S)) {
-		dev_err(dev, "RAS interrupt!\n");
-
-		int_st |= 1 << HNS_ROCE_V2_VF_INT_ST_RAS_INT_S;
-		roce_write(hr_dev, ROCEE_VF_ABN_INT_ST_REG, int_st);
-
-		int_en |= 1 << HNS_ROCE_V2_VF_ABN_INT_EN_S;
-		roce_write(hr_dev, ROCEE_VF_ABN_INT_EN_REG, int_en);
-
-		int_work = 1;
+		int_work = IRQ_HANDLED;
 	} else {
-		dev_err(dev, "There is no abnormal irq found!\n");
+		dev_err(hr_dev->dev, "there is no basic abn irq found.\n");
+	}
+
+	return IRQ_RETVAL(int_work);
+}
+
+static int fmea_ram_ecc_query(struct hns_roce_dev *hr_dev,
+			       struct fmea_ram_ecc *ecc_info)
+{
+	struct hns_roce_cmq_desc desc;
+	struct hns_roce_cmq_req *req = (struct hns_roce_cmq_req *)desc.data;
+	int ret;
+
+	hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_QUERY_RAM_ECC, true);
+	ret = hns_roce_cmq_send(hr_dev, &desc, 1);
+	if (ret)
+		return ret;
+
+	ecc_info->is_ecc_err = hr_reg_read(req, QUERY_RAM_ECC_1BIT_ERR);
+	ecc_info->res_type = hr_reg_read(req, QUERY_RAM_ECC_RES_TYPE);
+	ecc_info->index = hr_reg_read(req, QUERY_RAM_ECC_TAG);
+
+	return 0;
+}
+
+static int fmea_recover_gmv(struct hns_roce_dev *hr_dev, u32 idx)
+{
+	struct hns_roce_cmq_desc desc;
+	struct hns_roce_cmq_req *req = (struct hns_roce_cmq_req *)desc.data;
+	u32 addr_upper;
+	u32 addr_low;
+	int ret;
+
+	hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_CFG_GMV_BT, true);
+	hr_reg_write(req, CFG_GMV_BT_IDX, idx);
+
+	ret = hns_roce_cmq_send(hr_dev, &desc, 1);
+	if (ret) {
+		dev_err(hr_dev->dev,
+			"failed to execute cmd to read gmv, ret = %d.\n", ret);
+		return ret;
+	}
+
+	addr_low =  hr_reg_read(req, CFG_GMV_BT_BA_L);
+	addr_upper = hr_reg_read(req, CFG_GMV_BT_BA_H);
+
+	hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_CFG_GMV_BT, false);
+	hr_reg_write(req, CFG_GMV_BT_BA_L, addr_low);
+	hr_reg_write(req, CFG_GMV_BT_BA_H, addr_upper);
+	hr_reg_write(req, CFG_GMV_BT_IDX, idx);
+
+	return hns_roce_cmq_send(hr_dev, &desc, 1);
+}
+
+static u64 fmea_get_ram_res_addr(u32 res_type, __le64 *data)
+{
+	if (res_type == ECC_RESOURCE_QPC_TIMER ||
+	    res_type == ECC_RESOURCE_CQC_TIMER ||
+	    res_type == ECC_RESOURCE_SCCC)
+		return le64_to_cpu(*data);
+
+	return le64_to_cpu(*data) << PAGE_SHIFT;
+}
+
+static int fmea_recover_others(struct hns_roce_dev *hr_dev, u32 res_type,
+			       u32 index)
+{
+	u8 write_bt0_op = fmea_ram_res[res_type].write_bt0_op;
+	u8 read_bt0_op = fmea_ram_res[res_type].read_bt0_op;
+	struct hns_roce_cmd_mailbox *mailbox;
+	u64 addr;
+	int ret;
+
+	mailbox = hns_roce_alloc_cmd_mailbox(hr_dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	ret = hns_roce_cmd_mbox(hr_dev, 0, mailbox->dma, read_bt0_op, index);
+	if (ret) {
+		dev_err(hr_dev->dev,
+			"failed to execute cmd to read fmea ram, ret = %d.\n",
+			ret);
+		goto out;
+	}
+
+	addr = fmea_get_ram_res_addr(res_type, mailbox->buf);
+
+	ret = hns_roce_cmd_mbox(hr_dev, addr, 0, write_bt0_op, index);
+	if (ret)
+		dev_err(hr_dev->dev,
+			"failed to execute cmd to write fmea ram, ret = %d.\n",
+			ret);
+
+out:
+	hns_roce_free_cmd_mailbox(hr_dev, mailbox);
+	return ret;
+}
+
+static void fmea_ram_ecc_recover(struct hns_roce_dev *hr_dev,
+				 struct fmea_ram_ecc *ecc_info)
+{
+	u32 res_type = ecc_info->res_type;
+	u32 index = ecc_info->index;
+	int ret;
+
+	BUILD_BUG_ON(ARRAY_SIZE(fmea_ram_res) != ECC_RESOURCE_COUNT);
+
+	if (res_type >= ECC_RESOURCE_COUNT) {
+		dev_err(hr_dev->dev, "unsupported fmea ram ecc type %u.\n",
+			res_type);
+		return;
+	}
+
+	if (res_type == ECC_RESOURCE_GMV)
+		ret = fmea_recover_gmv(hr_dev, index);
+	else
+		ret = fmea_recover_others(hr_dev, res_type, index);
+	if (ret)
+		dev_err(hr_dev->dev,
+			"failed to recover %s, index = %u, ret = %d.\n",
+			fmea_ram_res[res_type].name, index, ret);
+}
+
+static void fmea_ram_ecc_work(struct work_struct *ecc_work)
+{
+	struct hns_roce_dev *hr_dev =
+		container_of(ecc_work, struct hns_roce_dev, ecc_work);
+	struct fmea_ram_ecc ecc_info = {};
+
+	if (fmea_ram_ecc_query(hr_dev, &ecc_info)) {
+		dev_err(hr_dev->dev, "failed to query fmea ram ecc.\n");
+		return;
+	}
+
+	if (!ecc_info.is_ecc_err) {
+		dev_err(hr_dev->dev, "there is no fmea ram ecc err found.\n");
+		return;
+	}
+
+	fmea_ram_ecc_recover(hr_dev, &ecc_info);
+}
+
+static irqreturn_t hns_roce_v2_msix_interrupt_abn(int irq, void *dev_id)
+{
+	struct hns_roce_dev *hr_dev = dev_id;
+	irqreturn_t int_work = IRQ_NONE;
+	u32 int_st;
+
+	int_st = roce_read(hr_dev, ROCEE_VF_ABN_INT_ST_REG);
+
+	if (int_st) {
+		int_work = abnormal_interrupt_basic(hr_dev, int_st);
+	} else if (hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP09) {
+		queue_work(hr_dev->irq_workq, &hr_dev->ecc_work);
+		int_work = IRQ_HANDLED;
+	} else {
+		dev_err(hr_dev->dev, "there is no abnormal irq found.\n");
 	}
 
 	return IRQ_RETVAL(int_work);
@@ -6342,6 +6520,8 @@ static int hns_roce_v2_init_eq_table(struct hns_roce_dev *hr_dev)
 		}
 	}
 
+	INIT_WORK(&hr_dev->ecc_work, fmea_ram_ecc_work);
+
 	hr_dev->irq_workq = alloc_ordered_workqueue("hns_roce_irq_workq", 0);
 	if (!hr_dev->irq_workq) {
 		dev_err(dev, "failed to create irq workqueue.\n");
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
index 7ffb7824d268..f96debac30fe 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
@@ -250,6 +250,7 @@ enum hns_roce_opcode_type {
 	HNS_ROCE_OPC_CFG_GMV_TBL			= 0x850f,
 	HNS_ROCE_OPC_CFG_GMV_BT				= 0x8510,
 	HNS_ROCE_OPC_EXT_CFG				= 0x8512,
+	HNS_ROCE_QUERY_RAM_ECC				= 0x8513,
 	HNS_SWITCH_PARAMETER_CFG			= 0x1033,
 };
 
@@ -1107,6 +1108,11 @@ enum {
 #define CFG_GMV_BT_BA_H CMQ_REQ_FIELD_LOC(51, 32)
 #define CFG_GMV_BT_IDX CMQ_REQ_FIELD_LOC(95, 64)
 
+/* Fields of HNS_ROCE_QUERY_RAM_ECC */
+#define QUERY_RAM_ECC_1BIT_ERR CMQ_REQ_FIELD_LOC(31, 0)
+#define QUERY_RAM_ECC_RES_TYPE CMQ_REQ_FIELD_LOC(63, 32)
+#define QUERY_RAM_ECC_TAG CMQ_REQ_FIELD_LOC(95, 64)
+
 struct hns_roce_cfg_sgid_tb {
 	__le32	table_idx_rsv;
 	__le32	vf_sgid_l;
@@ -1343,6 +1349,12 @@ struct hns_roce_dip {
 	struct list_head node; /* all dips are on a list */
 };
 
+struct fmea_ram_ecc {
+	u32	is_ecc_err;
+	u32	res_type;
+	u32	index;
+};
+
 /* only for RNR timeout issue of HIP08 */
 #define HNS_ROCE_CLOCK_ADJUST 1000
 #define HNS_ROCE_MAX_CQ_PERIOD 65
@@ -1382,7 +1394,6 @@ struct hns_roce_dip {
 #define HNS_ROCE_V2_ASYNC_EQE_NUM		0x1000
 
 #define HNS_ROCE_V2_VF_INT_ST_AEQ_OVERFLOW_S	0
-#define HNS_ROCE_V2_VF_INT_ST_RAS_INT_S		1
 
 #define HNS_ROCE_EQ_DB_CMD_AEQ			0x0
 #define HNS_ROCE_EQ_DB_CMD_AEQ_ARMED		0x1
diff --git a/drivers/infiniband/hw/irdma/cm.c b/drivers/infiniband/hw/irdma/cm.c
index 646fa8677490..7b086fe63a24 100644
--- a/drivers/infiniband/hw/irdma/cm.c
+++ b/drivers/infiniband/hw/irdma/cm.c
@@ -1477,12 +1477,13 @@ irdma_find_listener(struct irdma_cm_core *cm_core, u32 *dst_addr, u16 dst_port,
 	list_for_each_entry (listen_node, &cm_core->listen_list, list) {
 		memcpy(listen_addr, listen_node->loc_addr, sizeof(listen_addr));
 		listen_port = listen_node->loc_port;
+		if (listen_port != dst_port ||
+		    !(listener_state & listen_node->listener_state))
+			continue;
 		/* compare node pair, return node handle if a match */
-		if ((!memcmp(listen_addr, dst_addr, sizeof(listen_addr)) ||
-		     !memcmp(listen_addr, ip_zero, sizeof(listen_addr))) &&
-		    listen_port == dst_port &&
-		    vlan_id == listen_node->vlan_id &&
-		    (listener_state & listen_node->listener_state)) {
+		if (!memcmp(listen_addr, ip_zero, sizeof(listen_addr)) ||
+		    (!memcmp(listen_addr, dst_addr, sizeof(listen_addr)) &&
+		     vlan_id == listen_node->vlan_id)) {
 			refcount_inc(&listen_node->refcnt);
 			spin_unlock_irqrestore(&cm_core->listen_list_lock,
 					       flags);
diff --git a/drivers/infiniband/hw/irdma/ctrl.c b/drivers/infiniband/hw/irdma/ctrl.c
index 58c0e181ca2b..a41e0d21143a 100644
--- a/drivers/infiniband/hw/irdma/ctrl.c
+++ b/drivers/infiniband/hw/irdma/ctrl.c
@@ -4872,10 +4872,12 @@ int irdma_cfg_fpm_val(struct irdma_sc_dev *dev, u32 qp_count)
 
 		sd_diff = sd_needed - hmc_fpm_misc->max_sds;
 		if (sd_diff > 128) {
-			if (qpwanted > 128 && sd_diff > 144)
+			if (!(loop_count % 2) && qpwanted > 128) {
 				qpwanted /= 2;
-			mrwanted /= 2;
-			pblewanted /= 2;
+			} else {
+				mrwanted /= 2;
+				pblewanted /= 2;
+			}
 			continue;
 		}
 		if (dev->cqp->hmc_profile != IRDMA_HMC_PROFILE_FAVOR_VF &&
diff --git a/drivers/infiniband/hw/irdma/hw.c b/drivers/infiniband/hw/irdma/hw.c
index dd3943d22dc6..4f132c6fb653 100644
--- a/drivers/infiniband/hw/irdma/hw.c
+++ b/drivers/infiniband/hw/irdma/hw.c
@@ -257,10 +257,6 @@ static void irdma_process_aeq(struct irdma_pci_f *rf)
 				iwqp->last_aeq = info->ae_id;
 			spin_unlock_irqrestore(&iwqp->lock, flags);
 			ctx_info = &iwqp->ctx_info;
-			if (rdma_protocol_roce(&iwqp->iwdev->ibdev, 1))
-				ctx_info->roce_info->err_rq_idx_valid = true;
-			else
-				ctx_info->iwarp_info->err_rq_idx_valid = true;
 		} else {
 			if (info->ae_id != IRDMA_AE_CQ_OPERATION_ERROR)
 				continue;
@@ -370,16 +366,12 @@ static void irdma_process_aeq(struct irdma_pci_f *rf)
 		case IRDMA_AE_LCE_FUNCTION_CATASTROPHIC:
 		case IRDMA_AE_LCE_CQ_CATASTROPHIC:
 		case IRDMA_AE_UDA_XMIT_DGRAM_TOO_LONG:
-			if (rdma_protocol_roce(&iwdev->ibdev, 1))
-				ctx_info->roce_info->err_rq_idx_valid = false;
-			else
-				ctx_info->iwarp_info->err_rq_idx_valid = false;
-			fallthrough;
 		default:
-			ibdev_err(&iwdev->ibdev, "abnormal ae_id = 0x%x bool qp=%d qp_id = %d\n",
-				  info->ae_id, info->qp, info->qp_cq_id);
+			ibdev_err(&iwdev->ibdev, "abnormal ae_id = 0x%x bool qp=%d qp_id = %d, ae_src=%d\n",
+				  info->ae_id, info->qp, info->qp_cq_id, info->ae_src);
 			if (rdma_protocol_roce(&iwdev->ibdev, 1)) {
-				if (!info->sq && ctx_info->roce_info->err_rq_idx_valid) {
+				ctx_info->roce_info->err_rq_idx_valid = info->rq;
+				if (info->rq) {
 					ctx_info->roce_info->err_rq_idx = info->wqe_idx;
 					irdma_sc_qp_setctx_roce(&iwqp->sc_qp, iwqp->host_ctx.va,
 								ctx_info);
@@ -388,7 +380,8 @@ static void irdma_process_aeq(struct irdma_pci_f *rf)
 				irdma_cm_disconn(iwqp);
 				break;
 			}
-			if (!info->sq && ctx_info->iwarp_info->err_rq_idx_valid) {
+			ctx_info->iwarp_info->err_rq_idx_valid = info->rq;
+			if (info->rq) {
 				ctx_info->iwarp_info->err_rq_idx = info->wqe_idx;
 				ctx_info->tcp_info_valid = false;
 				ctx_info->iwarp_info_valid = true;
@@ -1512,10 +1505,7 @@ static int irdma_hmc_setup(struct irdma_pci_f *rf)
 	int status;
 	u32 qpcnt;
 
-	if (rf->rdma_ver == IRDMA_GEN_1)
-		qpcnt = rsrc_limits_table[rf->limits_sel].qplimit * 2;
-	else
-		qpcnt = rsrc_limits_table[rf->limits_sel].qplimit;
+	qpcnt = rsrc_limits_table[rf->limits_sel].qplimit;
 
 	rf->sd_type = IRDMA_SD_TYPE_DIRECT;
 	status = irdma_cfg_fpm_val(&rf->sc_dev, qpcnt);
@@ -1543,7 +1533,7 @@ static void irdma_del_init_mem(struct irdma_pci_f *rf)
 			  rf->obj_mem.pa);
 	rf->obj_mem.va = NULL;
 	if (rf->rdma_ver != IRDMA_GEN_1) {
-		kfree(rf->allocated_ws_nodes);
+		bitmap_free(rf->allocated_ws_nodes);
 		rf->allocated_ws_nodes = NULL;
 	}
 	kfree(rf->ceqlist);
@@ -1972,9 +1962,8 @@ u32 irdma_initialize_hw_rsrc(struct irdma_pci_f *rf)
 	u32 ret;
 
 	if (rf->rdma_ver != IRDMA_GEN_1) {
-		rf->allocated_ws_nodes =
-			kcalloc(BITS_TO_LONGS(IRDMA_MAX_WS_NODES),
-				sizeof(unsigned long), GFP_KERNEL);
+		rf->allocated_ws_nodes = bitmap_zalloc(IRDMA_MAX_WS_NODES,
+						       GFP_KERNEL);
 		if (!rf->allocated_ws_nodes)
 			return -ENOMEM;
 
@@ -2023,7 +2012,7 @@ u32 irdma_initialize_hw_rsrc(struct irdma_pci_f *rf)
 	return 0;
 
 mem_rsrc_kzalloc_fail:
-	kfree(rf->allocated_ws_nodes);
+	bitmap_free(rf->allocated_ws_nodes);
 	rf->allocated_ws_nodes = NULL;
 
 	return ret;
diff --git a/drivers/infiniband/hw/irdma/main.h b/drivers/infiniband/hw/irdma/main.h
index ef862bced20f..65e966ad3453 100644
--- a/drivers/infiniband/hw/irdma/main.h
+++ b/drivers/infiniband/hw/irdma/main.h
@@ -85,7 +85,7 @@ extern struct auxiliary_driver i40iw_auxiliary_drv;
 #define	IRDMA_NO_QSET	0xffff
 
 #define IW_CFG_FPM_QP_COUNT		32768
-#define IRDMA_MAX_PAGES_PER_FMR		512
+#define IRDMA_MAX_PAGES_PER_FMR		262144
 #define IRDMA_MIN_PAGES_PER_FMR		1
 #define IRDMA_CQP_COMPL_RQ_WQE_FLUSHED	2
 #define IRDMA_CQP_COMPL_SQ_WQE_FLUSHED	3
diff --git a/drivers/infiniband/hw/irdma/utils.c b/drivers/infiniband/hw/irdma/utils.c
index ab3c5208a123..fdf4cc88cb91 100644
--- a/drivers/infiniband/hw/irdma/utils.c
+++ b/drivers/infiniband/hw/irdma/utils.c
@@ -652,6 +652,7 @@ static const char *const irdma_cqp_cmd_names[IRDMA_MAX_CQP_OPS] = {
 };
 
 static const struct irdma_cqp_err_info irdma_noncrit_err_list[] = {
+	{0xffff, 0x8002, "Invalid State"},
 	{0xffff, 0x8006, "Flush No Wqe Pending"},
 	{0xffff, 0x8007, "Modify QP Bad Close"},
 	{0xffff, 0x8009, "LLP Closed"},
diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c
index 96135a228f26..9b07b8af2997 100644
--- a/drivers/infiniband/hw/irdma/verbs.c
+++ b/drivers/infiniband/hw/irdma/verbs.c
@@ -1776,11 +1776,11 @@ static int irdma_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata)
 	spin_unlock_irqrestore(&iwcq->lock, flags);
 
 	irdma_cq_wq_destroy(iwdev->rf, cq);
-	irdma_cq_free_rsrc(iwdev->rf, iwcq);
 
 	spin_lock_irqsave(&iwceq->ce_lock, flags);
 	irdma_sc_cleanup_ceqes(cq, ceq);
 	spin_unlock_irqrestore(&iwceq->ce_lock, flags);
+	irdma_cq_free_rsrc(iwdev->rf, iwcq);
 
 	return 0;
 }
@@ -2605,7 +2605,7 @@ static struct ib_mr *irdma_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
 	palloc = &iwpbl->pble_alloc;
 	iwmr->page_cnt = max_num_sg;
 	err_code = irdma_get_pble(iwdev->rf->pble_rsrc, palloc, iwmr->page_cnt,
-				  true);
+				  false);
 	if (err_code)
 		goto err_get_pble;
 
@@ -2641,8 +2641,16 @@ static int irdma_set_page(struct ib_mr *ibmr, u64 addr)
 	if (unlikely(iwmr->npages == iwmr->page_cnt))
 		return -ENOMEM;
 
-	pbl = palloc->level1.addr;
-	pbl[iwmr->npages++] = addr;
+	if (palloc->level == PBLE_LEVEL_2) {
+		struct irdma_pble_info *palloc_info =
+			palloc->level2.leaf + (iwmr->npages >> PBLE_512_SHIFT);
+
+		palloc_info->addr[iwmr->npages & (PBLE_PER_PAGE - 1)] = addr;
+	} else {
+		pbl = palloc->level1.addr;
+		pbl[iwmr->npages] = addr;
+	}
+	iwmr->npages++;
 
 	return 0;
 }
diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c
index 08371a80fdc2..be189e0525de 100644
--- a/drivers/infiniband/hw/mlx5/cq.c
+++ b/drivers/infiniband/hw/mlx5/cq.c
@@ -523,6 +523,10 @@ repoll:
 			    "Requestor" : "Responder", cq->mcq.cqn);
 		mlx5_ib_dbg(dev, "syndrome 0x%x, vendor syndrome 0x%x\n",
 			    err_cqe->syndrome, err_cqe->vendor_err_synd);
+		if (wc->status != IB_WC_WR_FLUSH_ERR &&
+		    (*cur_qp)->type == MLX5_IB_QPT_REG_UMR)
+			dev->umrc.state = MLX5_UMR_STATE_RECOVER;
+
 		if (opcode == MLX5_CQE_REQ_ERR) {
 			wq = &(*cur_qp)->sq;
 			wqe_ctr = be16_to_cpu(cqe64->wqe_counter);
diff --git a/drivers/infiniband/hw/mlx5/fs.c b/drivers/infiniband/hw/mlx5/fs.c
index 39ffb363ba0c..490ec308e309 100644
--- a/drivers/infiniband/hw/mlx5/fs.c
+++ b/drivers/infiniband/hw/mlx5/fs.c
@@ -679,7 +679,15 @@ enum flow_table_type {
 #define MLX5_FS_MAX_TYPES	 6
 #define MLX5_FS_MAX_ENTRIES	 BIT(16)
 
-static struct mlx5_ib_flow_prio *_get_prio(struct mlx5_flow_namespace *ns,
+static bool mlx5_ib_shared_ft_allowed(struct ib_device *device)
+{
+	struct mlx5_ib_dev *dev = to_mdev(device);
+
+	return MLX5_CAP_GEN(dev->mdev, shared_object_to_user_object_allowed);
+}
+
+static struct mlx5_ib_flow_prio *_get_prio(struct mlx5_ib_dev *dev,
+					   struct mlx5_flow_namespace *ns,
 					   struct mlx5_ib_flow_prio *prio,
 					   int priority,
 					   int num_entries, int num_groups,
@@ -688,6 +696,8 @@ static struct mlx5_ib_flow_prio *_get_prio(struct mlx5_flow_namespace *ns,
 	struct mlx5_flow_table_attr ft_attr = {};
 	struct mlx5_flow_table *ft;
 
+	if (mlx5_ib_shared_ft_allowed(&dev->ib_dev))
+		ft_attr.uid = MLX5_SHARED_RESOURCE_UID;
 	ft_attr.prio = priority;
 	ft_attr.max_fte = num_entries;
 	ft_attr.flags = flags;
@@ -784,8 +794,8 @@ static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev,
 
 	ft = prio->flow_table;
 	if (!ft)
-		return _get_prio(ns, prio, priority, max_table_size, num_groups,
-				 flags);
+		return _get_prio(dev, ns, prio, priority, max_table_size,
+				 num_groups, flags);
 
 	return prio;
 }
@@ -927,7 +937,7 @@ int mlx5_ib_fs_add_op_fc(struct mlx5_ib_dev *dev, u32 port_num,
 
 	prio = &dev->flow_db->opfcs[type];
 	if (!prio->flow_table) {
-		prio = _get_prio(ns, prio, priority,
+		prio = _get_prio(dev, ns, prio, priority,
 				 dev->num_ports * MAX_OPFC_RULES, 1, 0);
 		if (IS_ERR(prio)) {
 			err = PTR_ERR(prio);
@@ -1407,8 +1417,8 @@ free_ucmd:
 }
 
 static struct mlx5_ib_flow_prio *
-_get_flow_table(struct mlx5_ib_dev *dev,
-		struct mlx5_ib_flow_matcher *fs_matcher,
+_get_flow_table(struct mlx5_ib_dev *dev, u16 user_priority,
+		enum mlx5_flow_namespace_type ns_type,
 		bool mcast)
 {
 	struct mlx5_flow_namespace *ns = NULL;
@@ -1421,11 +1431,11 @@ _get_flow_table(struct mlx5_ib_dev *dev,
 	if (mcast)
 		priority = MLX5_IB_FLOW_MCAST_PRIO;
 	else
-		priority = ib_prio_to_core_prio(fs_matcher->priority, false);
+		priority = ib_prio_to_core_prio(user_priority, false);
 
 	esw_encap = mlx5_eswitch_get_encap_mode(dev->mdev) !=
 		DEVLINK_ESWITCH_ENCAP_MODE_NONE;
-	switch (fs_matcher->ns_type) {
+	switch (ns_type) {
 	case MLX5_FLOW_NAMESPACE_BYPASS:
 		max_table_size = BIT(
 			MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, log_max_ft_size));
@@ -1452,17 +1462,17 @@ _get_flow_table(struct mlx5_ib_dev *dev,
 					       reformat_l3_tunnel_to_l2) &&
 		    esw_encap)
 			flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT;
-		priority = fs_matcher->priority;
+		priority = user_priority;
 		break;
 	case MLX5_FLOW_NAMESPACE_RDMA_RX:
 		max_table_size = BIT(
 			MLX5_CAP_FLOWTABLE_RDMA_RX(dev->mdev, log_max_ft_size));
-		priority = fs_matcher->priority;
+		priority = user_priority;
 		break;
 	case MLX5_FLOW_NAMESPACE_RDMA_TX:
 		max_table_size = BIT(
 			MLX5_CAP_FLOWTABLE_RDMA_TX(dev->mdev, log_max_ft_size));
-		priority = fs_matcher->priority;
+		priority = user_priority;
 		break;
 	default:
 		break;
@@ -1470,11 +1480,11 @@ _get_flow_table(struct mlx5_ib_dev *dev,
 
 	max_table_size = min_t(int, max_table_size, MLX5_FS_MAX_ENTRIES);
 
-	ns = mlx5_get_flow_namespace(dev->mdev, fs_matcher->ns_type);
+	ns = mlx5_get_flow_namespace(dev->mdev, ns_type);
 	if (!ns)
 		return ERR_PTR(-EOPNOTSUPP);
 
-	switch (fs_matcher->ns_type) {
+	switch (ns_type) {
 	case MLX5_FLOW_NAMESPACE_BYPASS:
 		prio = &dev->flow_db->prios[priority];
 		break;
@@ -1499,7 +1509,7 @@ _get_flow_table(struct mlx5_ib_dev *dev,
 	if (prio->flow_table)
 		return prio;
 
-	return _get_prio(ns, prio, priority, max_table_size,
+	return _get_prio(dev, ns, prio, priority, max_table_size,
 			 MLX5_FS_MAX_TYPES, flags);
 }
 
@@ -1618,7 +1628,8 @@ static struct mlx5_ib_flow_handler *raw_fs_rule_add(
 	mcast = raw_fs_is_multicast(fs_matcher, cmd_in);
 	mutex_lock(&dev->flow_db->lock);
 
-	ft_prio = _get_flow_table(dev, fs_matcher, mcast);
+	ft_prio = _get_flow_table(dev, fs_matcher->priority,
+				  fs_matcher->ns_type, mcast);
 	if (IS_ERR(ft_prio)) {
 		err = PTR_ERR(ft_prio);
 		goto unlock;
@@ -2015,6 +2026,23 @@ static int flow_matcher_cleanup(struct ib_uobject *uobject,
 	return 0;
 }
 
+static int steering_anchor_cleanup(struct ib_uobject *uobject,
+				   enum rdma_remove_reason why,
+				   struct uverbs_attr_bundle *attrs)
+{
+	struct mlx5_ib_steering_anchor *obj = uobject->object;
+
+	if (atomic_read(&obj->usecnt))
+		return -EBUSY;
+
+	mutex_lock(&obj->dev->flow_db->lock);
+	put_flow_table(obj->dev, obj->ft_prio, true);
+	mutex_unlock(&obj->dev->flow_db->lock);
+
+	kfree(obj);
+	return 0;
+}
+
 static int mlx5_ib_matcher_ns(struct uverbs_attr_bundle *attrs,
 			      struct mlx5_ib_flow_matcher *obj)
 {
@@ -2050,12 +2078,10 @@ static int mlx5_ib_matcher_ns(struct uverbs_attr_bundle *attrs,
 		if (err)
 			return err;
 
-		if (flags) {
-			mlx5_ib_ft_type_to_namespace(
+		if (flags)
+			return mlx5_ib_ft_type_to_namespace(
 				MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_TX,
 				&obj->ns_type);
-			return 0;
-		}
 	}
 
 	obj->ns_type = MLX5_FLOW_NAMESPACE_BYPASS;
@@ -2121,6 +2147,75 @@ end:
 	return err;
 }
 
+static int UVERBS_HANDLER(MLX5_IB_METHOD_STEERING_ANCHOR_CREATE)(
+	struct uverbs_attr_bundle *attrs)
+{
+	struct ib_uobject *uobj = uverbs_attr_get_uobject(
+		attrs, MLX5_IB_ATTR_STEERING_ANCHOR_CREATE_HANDLE);
+	struct mlx5_ib_dev *dev = mlx5_udata_to_mdev(&attrs->driver_udata);
+	enum mlx5_ib_uapi_flow_table_type ib_uapi_ft_type;
+	enum mlx5_flow_namespace_type ns_type;
+	struct mlx5_ib_steering_anchor *obj;
+	struct mlx5_ib_flow_prio *ft_prio;
+	u16 priority;
+	u32 ft_id;
+	int err;
+
+	if (!capable(CAP_NET_RAW))
+		return -EPERM;
+
+	err = uverbs_get_const(&ib_uapi_ft_type, attrs,
+			       MLX5_IB_ATTR_STEERING_ANCHOR_FT_TYPE);
+	if (err)
+		return err;
+
+	err = mlx5_ib_ft_type_to_namespace(ib_uapi_ft_type, &ns_type);
+	if (err)
+		return err;
+
+	err = uverbs_copy_from(&priority, attrs,
+			       MLX5_IB_ATTR_STEERING_ANCHOR_PRIORITY);
+	if (err)
+		return err;
+
+	obj = kzalloc(sizeof(*obj), GFP_KERNEL);
+	if (!obj)
+		return -ENOMEM;
+
+	mutex_lock(&dev->flow_db->lock);
+	ft_prio = _get_flow_table(dev, priority, ns_type, 0);
+	if (IS_ERR(ft_prio)) {
+		mutex_unlock(&dev->flow_db->lock);
+		err = PTR_ERR(ft_prio);
+		goto free_obj;
+	}
+
+	ft_prio->refcount++;
+	ft_id = mlx5_flow_table_id(ft_prio->flow_table);
+	mutex_unlock(&dev->flow_db->lock);
+
+	err = uverbs_copy_to(attrs, MLX5_IB_ATTR_STEERING_ANCHOR_FT_ID,
+			     &ft_id, sizeof(ft_id));
+	if (err)
+		goto put_flow_table;
+
+	uobj->object = obj;
+	obj->dev = dev;
+	obj->ft_prio = ft_prio;
+	atomic_set(&obj->usecnt, 0);
+
+	return 0;
+
+put_flow_table:
+	mutex_lock(&dev->flow_db->lock);
+	put_flow_table(dev, ft_prio, true);
+	mutex_unlock(&dev->flow_db->lock);
+free_obj:
+	kfree(obj);
+
+	return err;
+}
+
 static struct ib_flow_action *
 mlx5_ib_create_modify_header(struct mlx5_ib_dev *dev,
 			     enum mlx5_ib_uapi_flow_table_type ft_type,
@@ -2477,6 +2572,35 @@ DECLARE_UVERBS_NAMED_OBJECT(MLX5_IB_OBJECT_FLOW_MATCHER,
 			    &UVERBS_METHOD(MLX5_IB_METHOD_FLOW_MATCHER_CREATE),
 			    &UVERBS_METHOD(MLX5_IB_METHOD_FLOW_MATCHER_DESTROY));
 
+DECLARE_UVERBS_NAMED_METHOD(
+	MLX5_IB_METHOD_STEERING_ANCHOR_CREATE,
+	UVERBS_ATTR_IDR(MLX5_IB_ATTR_STEERING_ANCHOR_CREATE_HANDLE,
+			MLX5_IB_OBJECT_STEERING_ANCHOR,
+			UVERBS_ACCESS_NEW,
+			UA_MANDATORY),
+	UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_STEERING_ANCHOR_FT_TYPE,
+			     enum mlx5_ib_uapi_flow_table_type,
+			     UA_MANDATORY),
+	UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_STEERING_ANCHOR_PRIORITY,
+			   UVERBS_ATTR_TYPE(u16),
+			   UA_MANDATORY),
+	UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_STEERING_ANCHOR_FT_ID,
+			   UVERBS_ATTR_TYPE(u32),
+			   UA_MANDATORY));
+
+DECLARE_UVERBS_NAMED_METHOD_DESTROY(
+	MLX5_IB_METHOD_STEERING_ANCHOR_DESTROY,
+	UVERBS_ATTR_IDR(MLX5_IB_ATTR_STEERING_ANCHOR_DESTROY_HANDLE,
+			MLX5_IB_OBJECT_STEERING_ANCHOR,
+			UVERBS_ACCESS_DESTROY,
+			UA_MANDATORY));
+
+DECLARE_UVERBS_NAMED_OBJECT(
+	MLX5_IB_OBJECT_STEERING_ANCHOR,
+	UVERBS_TYPE_ALLOC_IDR(steering_anchor_cleanup),
+	&UVERBS_METHOD(MLX5_IB_METHOD_STEERING_ANCHOR_CREATE),
+	&UVERBS_METHOD(MLX5_IB_METHOD_STEERING_ANCHOR_DESTROY));
+
 const struct uapi_definition mlx5_ib_flow_defs[] = {
 	UAPI_DEF_CHAIN_OBJ_TREE_NAMED(
 		MLX5_IB_OBJECT_FLOW_MATCHER),
@@ -2485,6 +2609,9 @@ const struct uapi_definition mlx5_ib_flow_defs[] = {
 		&mlx5_ib_fs),
 	UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_FLOW_ACTION,
 				&mlx5_ib_flow_actions),
+	UAPI_DEF_CHAIN_OBJ_TREE_NAMED(
+		MLX5_IB_OBJECT_STEERING_ANCHOR,
+		UAPI_DEF_IS_OBJ_SUPPORTED(mlx5_ib_shared_ft_allowed)),
 	{},
 };
 
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index b68fddeac0f1..a174a0eee8dc 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -4002,7 +4002,7 @@ static void mlx5_ib_stage_pre_ib_reg_umr_cleanup(struct mlx5_ib_dev *dev)
 {
 	int err;
 
-	err = mlx5_mr_cache_cleanup(dev);
+	err = mlx5_mkey_cache_cleanup(dev);
 	if (err)
 		mlx5_ib_warn(dev, "mr cache cleanup failed\n");
 
@@ -4022,7 +4022,7 @@ static int mlx5_ib_stage_post_ib_reg_umr_init(struct mlx5_ib_dev *dev)
 	if (ret)
 		return ret;
 
-	ret = mlx5_mr_cache_init(dev);
+	ret = mlx5_mkey_cache_init(dev);
 	if (ret) {
 		mlx5_ib_warn(dev, "mr cache init failed %d\n", ret);
 		mlx5r_umr_resource_cleanup(dev);
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 998b67509a53..2e2ad3918385 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -259,6 +259,12 @@ struct mlx5_ib_flow_matcher {
 	u8			match_criteria_enable;
 };
 
+struct mlx5_ib_steering_anchor {
+	struct mlx5_ib_flow_prio *ft_prio;
+	struct mlx5_ib_dev *dev;
+	atomic_t usecnt;
+};
+
 struct mlx5_ib_pp {
 	u16 index;
 	struct mlx5_core_dev *mdev;
@@ -613,6 +619,7 @@ struct mlx5_ib_mkey {
 	unsigned int ndescs;
 	struct wait_queue_head wait;
 	refcount_t usecount;
+	struct mlx5_cache_ent *cache_ent;
 };
 
 #define MLX5_IB_MTT_PRESENT (MLX5_IB_MTT_READ | MLX5_IB_MTT_WRITE)
@@ -635,20 +642,9 @@ struct mlx5_ib_mr {
 	struct ib_mr ibmr;
 	struct mlx5_ib_mkey mmkey;
 
-	/* User MR data */
-	struct mlx5_cache_ent *cache_ent;
-	/* Everything after cache_ent is zero'd when MR allocated */
 	struct ib_umem *umem;
 
 	union {
-		/* Used only while the MR is in the cache */
-		struct {
-			u32 out[MLX5_ST_SZ_DW(create_mkey_out)];
-			struct mlx5_async_work cb_work;
-			/* Cache list element */
-			struct list_head list;
-		};
-
 		/* Used only by kernel MRs (umem == NULL) */
 		struct {
 			void *descs;
@@ -688,12 +684,6 @@ struct mlx5_ib_mr {
 	};
 };
 
-/* Zero the fields in the mr that are variant depending on usage */
-static inline void mlx5_clear_mr(struct mlx5_ib_mr *mr)
-{
-	memset_after(mr, 0, cache_ent);
-}
-
 static inline bool is_odp_mr(struct mlx5_ib_mr *mr)
 {
 	return IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) && mr->umem &&
@@ -717,21 +707,29 @@ struct mlx5_ib_umr_context {
 	struct completion	done;
 };
 
+enum {
+	MLX5_UMR_STATE_ACTIVE,
+	MLX5_UMR_STATE_RECOVER,
+	MLX5_UMR_STATE_ERR,
+};
+
 struct umr_common {
 	struct ib_pd	*pd;
 	struct ib_cq	*cq;
 	struct ib_qp	*qp;
-	/* control access to UMR QP
+	/* Protects from UMR QP overflow
 	 */
 	struct semaphore	sem;
+	/* Protects from using UMR while the UMR is not active
+	 */
+	struct mutex lock;
+	unsigned int state;
 };
 
 struct mlx5_cache_ent {
-	struct list_head	head;
-	/* sync access to the cahce entry
-	 */
-	spinlock_t		lock;
-
+	struct xarray		mkeys;
+	unsigned long		stored;
+	unsigned long		reserved;
 
 	char                    name[4];
 	u32                     order;
@@ -743,18 +741,11 @@ struct mlx5_cache_ent {
 	u8 fill_to_high_water:1;
 
 	/*
-	 * - available_mrs is the length of list head, ie the number of MRs
-	 *   available for immediate allocation.
-	 * - total_mrs is available_mrs plus all in use MRs that could be
-	 *   returned to the cache.
-	 * - limit is the low water mark for available_mrs, 2* limit is the
+	 * - limit is the low water mark for stored mkeys, 2* limit is the
 	 *   upper water mark.
-	 * - pending is the number of MRs currently being created
 	 */
-	u32 total_mrs;
-	u32 available_mrs;
+	u32 in_use;
 	u32 limit;
-	u32 pending;
 
 	/* Statistics */
 	u32                     miss;
@@ -763,9 +754,19 @@ struct mlx5_cache_ent {
 	struct delayed_work	dwork;
 };
 
-struct mlx5_mr_cache {
+struct mlx5r_async_create_mkey {
+	union {
+		u32 in[MLX5_ST_SZ_BYTES(create_mkey_in)];
+		u32 out[MLX5_ST_SZ_DW(create_mkey_out)];
+	};
+	struct mlx5_async_work cb_work;
+	struct mlx5_cache_ent *ent;
+	u32 mkey;
+};
+
+struct mlx5_mkey_cache {
 	struct workqueue_struct *wq;
-	struct mlx5_cache_ent	ent[MAX_MR_CACHE_ENTRIES];
+	struct mlx5_cache_ent	ent[MAX_MKEY_CACHE_ENTRIES];
 	struct dentry		*root;
 	unsigned long		last_add;
 };
@@ -1064,7 +1065,7 @@ struct mlx5_ib_dev {
 	struct mlx5_ib_resources	devr;
 
 	atomic_t			mkey_var;
-	struct mlx5_mr_cache		cache;
+	struct mlx5_mkey_cache		cache;
 	struct timer_list		delay_timer;
 	/* Prevents soft lock on massive reg MRs */
 	struct mutex			slow_path_mutex;
@@ -1309,8 +1310,8 @@ void mlx5_ib_populate_pas(struct ib_umem *umem, size_t page_size, __be64 *pas,
 			  u64 access_flags);
 void mlx5_ib_copy_pas(u64 *old, u64 *new, int step, int num);
 int mlx5_ib_get_cqe_size(struct ib_cq *ibcq);
-int mlx5_mr_cache_init(struct mlx5_ib_dev *dev);
-int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev);
+int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev);
+int mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev);
 
 struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
 				       struct mlx5_cache_ent *ent,
@@ -1338,7 +1339,7 @@ int mlx5r_odp_create_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq);
 void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *ibdev);
 int __init mlx5_ib_odp_init(void);
 void mlx5_ib_odp_cleanup(void);
-void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent);
+void mlx5_odp_init_mkey_cache_entry(struct mlx5_cache_ent *ent);
 void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
 			   struct mlx5_ib_mr *mr, int flags);
 
@@ -1357,7 +1358,7 @@ static inline int mlx5r_odp_create_eq(struct mlx5_ib_dev *dev,
 static inline void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *ibdev) {}
 static inline int mlx5_ib_odp_init(void) { return 0; }
 static inline void mlx5_ib_odp_cleanup(void)				    {}
-static inline void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent) {}
+static inline void mlx5_odp_init_mkey_cache_entry(struct mlx5_cache_ent *ent) {}
 static inline void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
 					 struct mlx5_ib_mr *mr, int flags) {}
 
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index aedfd7ff4846..129d531bd01b 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -82,15 +82,14 @@ static void set_mkc_access_pd_addr_fields(void *mkc, int acc, u64 start_addr,
 	MLX5_SET64(mkc, mkc, start_addr, start_addr);
 }
 
-static void assign_mkey_variant(struct mlx5_ib_dev *dev,
-				struct mlx5_ib_mkey *mkey, u32 *in)
+static void assign_mkey_variant(struct mlx5_ib_dev *dev, u32 *mkey, u32 *in)
 {
 	u8 key = atomic_inc_return(&dev->mkey_var);
 	void *mkc;
 
 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
 	MLX5_SET(mkc, mkc, mkey_7_0, key);
-	mkey->key = key;
+	*mkey = key;
 }
 
 static int mlx5_ib_create_mkey(struct mlx5_ib_dev *dev,
@@ -98,7 +97,7 @@ static int mlx5_ib_create_mkey(struct mlx5_ib_dev *dev,
 {
 	int ret;
 
-	assign_mkey_variant(dev, mkey, in);
+	assign_mkey_variant(dev, &mkey->key, in);
 	ret = mlx5_core_create_mkey(dev->mdev, &mkey->key, in, inlen);
 	if (!ret)
 		init_waitqueue_head(&mkey->wait);
@@ -106,20 +105,21 @@ static int mlx5_ib_create_mkey(struct mlx5_ib_dev *dev,
 	return ret;
 }
 
-static int
-mlx5_ib_create_mkey_cb(struct mlx5_ib_dev *dev,
-		       struct mlx5_ib_mkey *mkey,
-		       struct mlx5_async_ctx *async_ctx,
-		       u32 *in, int inlen, u32 *out, int outlen,
-		       struct mlx5_async_work *context)
+static int mlx5_ib_create_mkey_cb(struct mlx5r_async_create_mkey *async_create)
 {
-	MLX5_SET(create_mkey_in, in, opcode, MLX5_CMD_OP_CREATE_MKEY);
-	assign_mkey_variant(dev, mkey, in);
-	return mlx5_cmd_exec_cb(async_ctx, in, inlen, out, outlen,
-				create_mkey_callback, context);
+	struct mlx5_ib_dev *dev = async_create->ent->dev;
+	size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
+	size_t outlen = MLX5_ST_SZ_BYTES(create_mkey_out);
+
+	MLX5_SET(create_mkey_in, async_create->in, opcode,
+		 MLX5_CMD_OP_CREATE_MKEY);
+	assign_mkey_variant(dev, &async_create->mkey, async_create->in);
+	return mlx5_cmd_exec_cb(&dev->async_ctx, async_create->in, inlen,
+				async_create->out, outlen, create_mkey_callback,
+				&async_create->cb_work);
 }
 
-static int mr_cache_max_order(struct mlx5_ib_dev *dev);
+static int mkey_cache_max_order(struct mlx5_ib_dev *dev);
 static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent);
 
 static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
@@ -142,40 +142,132 @@ static void create_mkey_warn(struct mlx5_ib_dev *dev, int status, void *out)
 	mlx5_cmd_out_err(dev->mdev, MLX5_CMD_OP_CREATE_MKEY, 0, out);
 }
 
+
+static int push_mkey(struct mlx5_cache_ent *ent, bool limit_pendings,
+		     void *to_store)
+{
+	XA_STATE(xas, &ent->mkeys, 0);
+	void *curr;
+
+	xa_lock_irq(&ent->mkeys);
+	if (limit_pendings &&
+	    (ent->reserved - ent->stored) > MAX_PENDING_REG_MR) {
+		xa_unlock_irq(&ent->mkeys);
+		return -EAGAIN;
+	}
+	while (1) {
+		/*
+		 * This is cmpxchg (NULL, XA_ZERO_ENTRY) however this version
+		 * doesn't transparently unlock. Instead we set the xas index to
+		 * the current value of reserved every iteration.
+		 */
+		xas_set(&xas, ent->reserved);
+		curr = xas_load(&xas);
+		if (!curr) {
+			if (to_store && ent->stored == ent->reserved)
+				xas_store(&xas, to_store);
+			else
+				xas_store(&xas, XA_ZERO_ENTRY);
+			if (xas_valid(&xas)) {
+				ent->reserved++;
+				if (to_store) {
+					if (ent->stored != ent->reserved)
+						__xa_store(&ent->mkeys,
+							   ent->stored,
+							   to_store,
+							   GFP_KERNEL);
+					ent->stored++;
+					queue_adjust_cache_locked(ent);
+					WRITE_ONCE(ent->dev->cache.last_add,
+						   jiffies);
+				}
+			}
+		}
+		xa_unlock_irq(&ent->mkeys);
+
+		/*
+		 * Notice xas_nomem() must always be called as it cleans
+		 * up any cached allocation.
+		 */
+		if (!xas_nomem(&xas, GFP_KERNEL))
+			break;
+		xa_lock_irq(&ent->mkeys);
+	}
+	if (xas_error(&xas))
+		return xas_error(&xas);
+	if (WARN_ON(curr))
+		return -EINVAL;
+	return 0;
+}
+
+static void undo_push_reserve_mkey(struct mlx5_cache_ent *ent)
+{
+	void *old;
+
+	ent->reserved--;
+	old = __xa_erase(&ent->mkeys, ent->reserved);
+	WARN_ON(old);
+}
+
+static void push_to_reserved(struct mlx5_cache_ent *ent, u32 mkey)
+{
+	void *old;
+
+	old = __xa_store(&ent->mkeys, ent->stored, xa_mk_value(mkey), 0);
+	WARN_ON(old);
+	ent->stored++;
+}
+
+static u32 pop_stored_mkey(struct mlx5_cache_ent *ent)
+{
+	void *old, *xa_mkey;
+
+	ent->stored--;
+	ent->reserved--;
+
+	if (ent->stored == ent->reserved) {
+		xa_mkey = __xa_erase(&ent->mkeys, ent->stored);
+		WARN_ON(!xa_mkey);
+		return (u32)xa_to_value(xa_mkey);
+	}
+
+	xa_mkey = __xa_store(&ent->mkeys, ent->stored, XA_ZERO_ENTRY,
+			     GFP_KERNEL);
+	WARN_ON(!xa_mkey || xa_is_err(xa_mkey));
+	old = __xa_erase(&ent->mkeys, ent->reserved);
+	WARN_ON(old);
+	return (u32)xa_to_value(xa_mkey);
+}
+
 static void create_mkey_callback(int status, struct mlx5_async_work *context)
 {
-	struct mlx5_ib_mr *mr =
-		container_of(context, struct mlx5_ib_mr, cb_work);
-	struct mlx5_cache_ent *ent = mr->cache_ent;
+	struct mlx5r_async_create_mkey *mkey_out =
+		container_of(context, struct mlx5r_async_create_mkey, cb_work);
+	struct mlx5_cache_ent *ent = mkey_out->ent;
 	struct mlx5_ib_dev *dev = ent->dev;
 	unsigned long flags;
 
 	if (status) {
-		create_mkey_warn(dev, status, mr->out);
-		kfree(mr);
-		spin_lock_irqsave(&ent->lock, flags);
-		ent->pending--;
+		create_mkey_warn(dev, status, mkey_out->out);
+		kfree(mkey_out);
+		xa_lock_irqsave(&ent->mkeys, flags);
+		undo_push_reserve_mkey(ent);
 		WRITE_ONCE(dev->fill_delay, 1);
-		spin_unlock_irqrestore(&ent->lock, flags);
+		xa_unlock_irqrestore(&ent->mkeys, flags);
 		mod_timer(&dev->delay_timer, jiffies + HZ);
 		return;
 	}
 
-	mr->mmkey.type = MLX5_MKEY_MR;
-	mr->mmkey.key |= mlx5_idx_to_mkey(
-		MLX5_GET(create_mkey_out, mr->out, mkey_index));
-	init_waitqueue_head(&mr->mmkey.wait);
-
+	mkey_out->mkey |= mlx5_idx_to_mkey(
+		MLX5_GET(create_mkey_out, mkey_out->out, mkey_index));
 	WRITE_ONCE(dev->cache.last_add, jiffies);
 
-	spin_lock_irqsave(&ent->lock, flags);
-	list_add_tail(&mr->list, &ent->head);
-	ent->available_mrs++;
-	ent->total_mrs++;
+	xa_lock_irqsave(&ent->mkeys, flags);
+	push_to_reserved(ent, mkey_out->mkey);
 	/* If we are doing fill_to_high_water then keep going. */
 	queue_adjust_cache_locked(ent);
-	ent->pending--;
-	spin_unlock_irqrestore(&ent->lock, flags);
+	xa_unlock_irqrestore(&ent->mkeys, flags);
+	kfree(mkey_out);
 }
 
 static int get_mkc_octo_size(unsigned int access_mode, unsigned int ndescs)
@@ -197,15 +289,8 @@ static int get_mkc_octo_size(unsigned int access_mode, unsigned int ndescs)
 	return ret;
 }
 
-static struct mlx5_ib_mr *alloc_cache_mr(struct mlx5_cache_ent *ent, void *mkc)
+static void set_cache_mkc(struct mlx5_cache_ent *ent, void *mkc)
 {
-	struct mlx5_ib_mr *mr;
-
-	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
-	if (!mr)
-		return NULL;
-	mr->cache_ent = ent;
-
 	set_mkc_access_pd_addr_fields(mkc, 0, 0, ent->dev->umrc.pd);
 	MLX5_SET(mkc, mkc, free, 1);
 	MLX5_SET(mkc, mkc, umr_en, 1);
@@ -215,133 +300,106 @@ static struct mlx5_ib_mr *alloc_cache_mr(struct mlx5_cache_ent *ent, void *mkc)
 	MLX5_SET(mkc, mkc, translations_octword_size,
 		 get_mkc_octo_size(ent->access_mode, ent->ndescs));
 	MLX5_SET(mkc, mkc, log_page_size, ent->page);
-	return mr;
 }
 
 /* Asynchronously schedule new MRs to be populated in the cache. */
 static int add_keys(struct mlx5_cache_ent *ent, unsigned int num)
 {
-	size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
-	struct mlx5_ib_mr *mr;
+	struct mlx5r_async_create_mkey *async_create;
 	void *mkc;
-	u32 *in;
 	int err = 0;
 	int i;
 
-	in = kzalloc(inlen, GFP_KERNEL);
-	if (!in)
-		return -ENOMEM;
-
-	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
 	for (i = 0; i < num; i++) {
-		mr = alloc_cache_mr(ent, mkc);
-		if (!mr) {
-			err = -ENOMEM;
-			break;
-		}
-		spin_lock_irq(&ent->lock);
-		if (ent->pending >= MAX_PENDING_REG_MR) {
-			err = -EAGAIN;
-			spin_unlock_irq(&ent->lock);
-			kfree(mr);
-			break;
-		}
-		ent->pending++;
-		spin_unlock_irq(&ent->lock);
-		err = mlx5_ib_create_mkey_cb(ent->dev, &mr->mmkey,
-					     &ent->dev->async_ctx, in, inlen,
-					     mr->out, sizeof(mr->out),
-					     &mr->cb_work);
+		async_create = kzalloc(sizeof(struct mlx5r_async_create_mkey),
+				       GFP_KERNEL);
+		if (!async_create)
+			return -ENOMEM;
+		mkc = MLX5_ADDR_OF(create_mkey_in, async_create->in,
+				   memory_key_mkey_entry);
+		set_cache_mkc(ent, mkc);
+		async_create->ent = ent;
+
+		err = push_mkey(ent, true, NULL);
+		if (err)
+			goto free_async_create;
+
+		err = mlx5_ib_create_mkey_cb(async_create);
 		if (err) {
-			spin_lock_irq(&ent->lock);
-			ent->pending--;
-			spin_unlock_irq(&ent->lock);
 			mlx5_ib_warn(ent->dev, "create mkey failed %d\n", err);
-			kfree(mr);
-			break;
+			goto err_undo_reserve;
 		}
 	}
 
-	kfree(in);
+	return 0;
+
+err_undo_reserve:
+	xa_lock_irq(&ent->mkeys);
+	undo_push_reserve_mkey(ent);
+	xa_unlock_irq(&ent->mkeys);
+free_async_create:
+	kfree(async_create);
 	return err;
 }
 
 /* Synchronously create a MR in the cache */
-static struct mlx5_ib_mr *create_cache_mr(struct mlx5_cache_ent *ent)
+static int create_cache_mkey(struct mlx5_cache_ent *ent, u32 *mkey)
 {
 	size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
-	struct mlx5_ib_mr *mr;
 	void *mkc;
 	u32 *in;
 	int err;
 
 	in = kzalloc(inlen, GFP_KERNEL);
 	if (!in)
-		return ERR_PTR(-ENOMEM);
+		return -ENOMEM;
 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
+	set_cache_mkc(ent, mkc);
 
-	mr = alloc_cache_mr(ent, mkc);
-	if (!mr) {
-		err = -ENOMEM;
-		goto free_in;
-	}
-
-	err = mlx5_core_create_mkey(ent->dev->mdev, &mr->mmkey.key, in, inlen);
+	err = mlx5_core_create_mkey(ent->dev->mdev, mkey, in, inlen);
 	if (err)
-		goto free_mr;
+		goto free_in;
 
-	init_waitqueue_head(&mr->mmkey.wait);
-	mr->mmkey.type = MLX5_MKEY_MR;
 	WRITE_ONCE(ent->dev->cache.last_add, jiffies);
-	spin_lock_irq(&ent->lock);
-	ent->total_mrs++;
-	spin_unlock_irq(&ent->lock);
-	kfree(in);
-	return mr;
-free_mr:
-	kfree(mr);
 free_in:
 	kfree(in);
-	return ERR_PTR(err);
+	return err;
 }
 
 static void remove_cache_mr_locked(struct mlx5_cache_ent *ent)
 {
-	struct mlx5_ib_mr *mr;
+	u32 mkey;
 
-	lockdep_assert_held(&ent->lock);
-	if (list_empty(&ent->head))
+	lockdep_assert_held(&ent->mkeys.xa_lock);
+	if (!ent->stored)
 		return;
-	mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
-	list_del(&mr->list);
-	ent->available_mrs--;
-	ent->total_mrs--;
-	spin_unlock_irq(&ent->lock);
-	mlx5_core_destroy_mkey(ent->dev->mdev, mr->mmkey.key);
-	kfree(mr);
-	spin_lock_irq(&ent->lock);
+	mkey = pop_stored_mkey(ent);
+	xa_unlock_irq(&ent->mkeys);
+	mlx5_core_destroy_mkey(ent->dev->mdev, mkey);
+	xa_lock_irq(&ent->mkeys);
 }
 
 static int resize_available_mrs(struct mlx5_cache_ent *ent, unsigned int target,
 				bool limit_fill)
+	 __acquires(&ent->mkeys) __releases(&ent->mkeys)
 {
 	int err;
 
-	lockdep_assert_held(&ent->lock);
+	lockdep_assert_held(&ent->mkeys.xa_lock);
 
 	while (true) {
 		if (limit_fill)
 			target = ent->limit * 2;
-		if (target == ent->available_mrs + ent->pending)
+		if (target == ent->reserved)
 			return 0;
-		if (target > ent->available_mrs + ent->pending) {
-			u32 todo = target - (ent->available_mrs + ent->pending);
+		if (target > ent->reserved) {
+			u32 todo = target - ent->reserved;
 
-			spin_unlock_irq(&ent->lock);
+			xa_unlock_irq(&ent->mkeys);
 			err = add_keys(ent, todo);
 			if (err == -EAGAIN)
 				usleep_range(3000, 5000);
-			spin_lock_irq(&ent->lock);
+			xa_lock_irq(&ent->mkeys);
 			if (err) {
 				if (err != -EAGAIN)
 					return err;
@@ -366,15 +424,15 @@ static ssize_t size_write(struct file *filp, const char __user *buf,
 
 	/*
 	 * Target is the new value of total_mrs the user requests, however we
-	 * cannot free MRs that are in use. Compute the target value for
-	 * available_mrs.
+	 * cannot free MRs that are in use. Compute the target value for stored
+	 * mkeys.
 	 */
-	spin_lock_irq(&ent->lock);
-	if (target < ent->total_mrs - ent->available_mrs) {
+	xa_lock_irq(&ent->mkeys);
+	if (target < ent->in_use) {
 		err = -EINVAL;
 		goto err_unlock;
 	}
-	target = target - (ent->total_mrs - ent->available_mrs);
+	target = target - ent->in_use;
 	if (target < ent->limit || target > ent->limit*2) {
 		err = -EINVAL;
 		goto err_unlock;
@@ -382,12 +440,12 @@ static ssize_t size_write(struct file *filp, const char __user *buf,
 	err = resize_available_mrs(ent, target, false);
 	if (err)
 		goto err_unlock;
-	spin_unlock_irq(&ent->lock);
+	xa_unlock_irq(&ent->mkeys);
 
 	return count;
 
 err_unlock:
-	spin_unlock_irq(&ent->lock);
+	xa_unlock_irq(&ent->mkeys);
 	return err;
 }
 
@@ -398,7 +456,7 @@ static ssize_t size_read(struct file *filp, char __user *buf, size_t count,
 	char lbuf[20];
 	int err;
 
-	err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->total_mrs);
+	err = snprintf(lbuf, sizeof(lbuf), "%ld\n", ent->stored + ent->in_use);
 	if (err < 0)
 		return err;
 
@@ -427,10 +485,10 @@ static ssize_t limit_write(struct file *filp, const char __user *buf,
 	 * Upon set we immediately fill the cache to high water mark implied by
 	 * the limit.
 	 */
-	spin_lock_irq(&ent->lock);
+	xa_lock_irq(&ent->mkeys);
 	ent->limit = var;
 	err = resize_available_mrs(ent, 0, true);
-	spin_unlock_irq(&ent->lock);
+	xa_unlock_irq(&ent->mkeys);
 	if (err)
 		return err;
 	return count;
@@ -457,17 +515,17 @@ static const struct file_operations limit_fops = {
 	.read	= limit_read,
 };
 
-static bool someone_adding(struct mlx5_mr_cache *cache)
+static bool someone_adding(struct mlx5_mkey_cache *cache)
 {
 	unsigned int i;
 
-	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
+	for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) {
 		struct mlx5_cache_ent *ent = &cache->ent[i];
 		bool ret;
 
-		spin_lock_irq(&ent->lock);
-		ret = ent->available_mrs < ent->limit;
-		spin_unlock_irq(&ent->lock);
+		xa_lock_irq(&ent->mkeys);
+		ret = ent->stored < ent->limit;
+		xa_unlock_irq(&ent->mkeys);
 		if (ret)
 			return true;
 	}
@@ -481,26 +539,26 @@ static bool someone_adding(struct mlx5_mr_cache *cache)
  */
 static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent)
 {
-	lockdep_assert_held(&ent->lock);
+	lockdep_assert_held(&ent->mkeys.xa_lock);
 
 	if (ent->disabled || READ_ONCE(ent->dev->fill_delay))
 		return;
-	if (ent->available_mrs < ent->limit) {
+	if (ent->stored < ent->limit) {
 		ent->fill_to_high_water = true;
 		mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
 	} else if (ent->fill_to_high_water &&
-		   ent->available_mrs + ent->pending < 2 * ent->limit) {
+		   ent->reserved < 2 * ent->limit) {
 		/*
 		 * Once we start populating due to hitting a low water mark
 		 * continue until we pass the high water mark.
 		 */
 		mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
-	} else if (ent->available_mrs == 2 * ent->limit) {
+	} else if (ent->stored == 2 * ent->limit) {
 		ent->fill_to_high_water = false;
-	} else if (ent->available_mrs > 2 * ent->limit) {
+	} else if (ent->stored > 2 * ent->limit) {
 		/* Queue deletion of excess entries */
 		ent->fill_to_high_water = false;
-		if (ent->pending)
+		if (ent->stored != ent->reserved)
 			queue_delayed_work(ent->dev->cache.wq, &ent->dwork,
 					   msecs_to_jiffies(1000));
 		else
@@ -511,25 +569,24 @@ static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent)
 static void __cache_work_func(struct mlx5_cache_ent *ent)
 {
 	struct mlx5_ib_dev *dev = ent->dev;
-	struct mlx5_mr_cache *cache = &dev->cache;
+	struct mlx5_mkey_cache *cache = &dev->cache;
 	int err;
 
-	spin_lock_irq(&ent->lock);
+	xa_lock_irq(&ent->mkeys);
 	if (ent->disabled)
 		goto out;
 
-	if (ent->fill_to_high_water &&
-	    ent->available_mrs + ent->pending < 2 * ent->limit &&
+	if (ent->fill_to_high_water && ent->reserved < 2 * ent->limit &&
 	    !READ_ONCE(dev->fill_delay)) {
-		spin_unlock_irq(&ent->lock);
+		xa_unlock_irq(&ent->mkeys);
 		err = add_keys(ent, 1);
-		spin_lock_irq(&ent->lock);
+		xa_lock_irq(&ent->mkeys);
 		if (ent->disabled)
 			goto out;
 		if (err) {
 			/*
-			 * EAGAIN only happens if pending is positive, so we
-			 * will be rescheduled from reg_mr_callback(). The only
+			 * EAGAIN only happens if there are pending MRs, so we
+			 * will be rescheduled when storing them. The only
 			 * failure path here is ENOMEM.
 			 */
 			if (err != -EAGAIN) {
@@ -541,7 +598,7 @@ static void __cache_work_func(struct mlx5_cache_ent *ent)
 						   msecs_to_jiffies(1000));
 			}
 		}
-	} else if (ent->available_mrs > 2 * ent->limit) {
+	} else if (ent->stored > 2 * ent->limit) {
 		bool need_delay;
 
 		/*
@@ -556,11 +613,11 @@ static void __cache_work_func(struct mlx5_cache_ent *ent)
 		 * the garbage collection work to try to run in next cycle, in
 		 * order to free CPU resources to other tasks.
 		 */
-		spin_unlock_irq(&ent->lock);
+		xa_unlock_irq(&ent->mkeys);
 		need_delay = need_resched() || someone_adding(cache) ||
 			     !time_after(jiffies,
 					 READ_ONCE(cache->last_add) + 300 * HZ);
-		spin_lock_irq(&ent->lock);
+		xa_lock_irq(&ent->mkeys);
 		if (ent->disabled)
 			goto out;
 		if (need_delay) {
@@ -571,7 +628,7 @@ static void __cache_work_func(struct mlx5_cache_ent *ent)
 		queue_adjust_cache_locked(ent);
 	}
 out:
-	spin_unlock_irq(&ent->lock);
+	xa_unlock_irq(&ent->mkeys);
 }
 
 static void delayed_cache_work_func(struct work_struct *work)
@@ -587,73 +644,59 @@ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
 				       int access_flags)
 {
 	struct mlx5_ib_mr *mr;
+	int err;
 
-	/* Matches access in alloc_cache_mr() */
 	if (!mlx5r_umr_can_reconfig(dev, 0, access_flags))
 		return ERR_PTR(-EOPNOTSUPP);
 
-	spin_lock_irq(&ent->lock);
-	if (list_empty(&ent->head)) {
+	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+	if (!mr)
+		return ERR_PTR(-ENOMEM);
+
+	xa_lock_irq(&ent->mkeys);
+	ent->in_use++;
+
+	if (!ent->stored) {
 		queue_adjust_cache_locked(ent);
 		ent->miss++;
-		spin_unlock_irq(&ent->lock);
-		mr = create_cache_mr(ent);
-		if (IS_ERR(mr))
-			return mr;
+		xa_unlock_irq(&ent->mkeys);
+		err = create_cache_mkey(ent, &mr->mmkey.key);
+		if (err) {
+			xa_lock_irq(&ent->mkeys);
+			ent->in_use--;
+			xa_unlock_irq(&ent->mkeys);
+			kfree(mr);
+			return ERR_PTR(err);
+		}
 	} else {
-		mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
-		list_del(&mr->list);
-		ent->available_mrs--;
+		mr->mmkey.key = pop_stored_mkey(ent);
 		queue_adjust_cache_locked(ent);
-		spin_unlock_irq(&ent->lock);
-
-		mlx5_clear_mr(mr);
+		xa_unlock_irq(&ent->mkeys);
 	}
+	mr->mmkey.cache_ent = ent;
+	mr->mmkey.type = MLX5_MKEY_MR;
+	init_waitqueue_head(&mr->mmkey.wait);
 	return mr;
 }
 
-static void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
-{
-	struct mlx5_cache_ent *ent = mr->cache_ent;
-
-	WRITE_ONCE(dev->cache.last_add, jiffies);
-	spin_lock_irq(&ent->lock);
-	list_add_tail(&mr->list, &ent->head);
-	ent->available_mrs++;
-	queue_adjust_cache_locked(ent);
-	spin_unlock_irq(&ent->lock);
-}
-
 static void clean_keys(struct mlx5_ib_dev *dev, int c)
 {
-	struct mlx5_mr_cache *cache = &dev->cache;
+	struct mlx5_mkey_cache *cache = &dev->cache;
 	struct mlx5_cache_ent *ent = &cache->ent[c];
-	struct mlx5_ib_mr *tmp_mr;
-	struct mlx5_ib_mr *mr;
-	LIST_HEAD(del_list);
+	u32 mkey;
 
 	cancel_delayed_work(&ent->dwork);
-	while (1) {
-		spin_lock_irq(&ent->lock);
-		if (list_empty(&ent->head)) {
-			spin_unlock_irq(&ent->lock);
-			break;
-		}
-		mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
-		list_move(&mr->list, &del_list);
-		ent->available_mrs--;
-		ent->total_mrs--;
-		spin_unlock_irq(&ent->lock);
-		mlx5_core_destroy_mkey(dev->mdev, mr->mmkey.key);
-	}
-
-	list_for_each_entry_safe(mr, tmp_mr, &del_list, list) {
-		list_del(&mr->list);
-		kfree(mr);
+	xa_lock_irq(&ent->mkeys);
+	while (ent->stored) {
+		mkey = pop_stored_mkey(ent);
+		xa_unlock_irq(&ent->mkeys);
+		mlx5_core_destroy_mkey(dev->mdev, mkey);
+		xa_lock_irq(&ent->mkeys);
 	}
+	xa_unlock_irq(&ent->mkeys);
 }
 
-static void mlx5_mr_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
+static void mlx5_mkey_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
 {
 	if (!mlx5_debugfs_root || dev->is_rep)
 		return;
@@ -662,9 +705,9 @@ static void mlx5_mr_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
 	dev->cache.root = NULL;
 }
 
-static void mlx5_mr_cache_debugfs_init(struct mlx5_ib_dev *dev)
+static void mlx5_mkey_cache_debugfs_init(struct mlx5_ib_dev *dev)
 {
-	struct mlx5_mr_cache *cache = &dev->cache;
+	struct mlx5_mkey_cache *cache = &dev->cache;
 	struct mlx5_cache_ent *ent;
 	struct dentry *dir;
 	int i;
@@ -674,13 +717,13 @@ static void mlx5_mr_cache_debugfs_init(struct mlx5_ib_dev *dev)
 
 	cache->root = debugfs_create_dir("mr_cache", mlx5_debugfs_get_dev_root(dev->mdev));
 
-	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
+	for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) {
 		ent = &cache->ent[i];
 		sprintf(ent->name, "%d", ent->order);
 		dir = debugfs_create_dir(ent->name, cache->root);
 		debugfs_create_file("size", 0600, dir, ent, &size_fops);
 		debugfs_create_file("limit", 0600, dir, ent, &limit_fops);
-		debugfs_create_u32("cur", 0400, dir, &ent->available_mrs);
+		debugfs_create_ulong("cur", 0400, dir, &ent->stored);
 		debugfs_create_u32("miss", 0600, dir, &ent->miss);
 	}
 }
@@ -692,9 +735,9 @@ static void delay_time_func(struct timer_list *t)
 	WRITE_ONCE(dev->fill_delay, 0);
 }
 
-int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
+int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
 {
-	struct mlx5_mr_cache *cache = &dev->cache;
+	struct mlx5_mkey_cache *cache = &dev->cache;
 	struct mlx5_cache_ent *ent;
 	int i;
 
@@ -707,22 +750,21 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
 
 	mlx5_cmd_init_async_ctx(dev->mdev, &dev->async_ctx);
 	timer_setup(&dev->delay_timer, delay_time_func, 0);
-	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
+	for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) {
 		ent = &cache->ent[i];
-		INIT_LIST_HEAD(&ent->head);
-		spin_lock_init(&ent->lock);
+		xa_init_flags(&ent->mkeys, XA_FLAGS_LOCK_IRQ);
 		ent->order = i + 2;
 		ent->dev = dev;
 		ent->limit = 0;
 
 		INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);
 
-		if (i > MR_CACHE_LAST_STD_ENTRY) {
-			mlx5_odp_init_mr_cache_entry(ent);
+		if (i > MKEY_CACHE_LAST_STD_ENTRY) {
+			mlx5_odp_init_mkey_cache_entry(ent);
 			continue;
 		}
 
-		if (ent->order > mr_cache_max_order(dev))
+		if (ent->order > mkey_cache_max_order(dev))
 			continue;
 
 		ent->page = PAGE_SHIFT;
@@ -734,36 +776,36 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
 			ent->limit = dev->mdev->profile.mr_cache[i].limit;
 		else
 			ent->limit = 0;
-		spin_lock_irq(&ent->lock);
+		xa_lock_irq(&ent->mkeys);
 		queue_adjust_cache_locked(ent);
-		spin_unlock_irq(&ent->lock);
+		xa_unlock_irq(&ent->mkeys);
 	}
 
-	mlx5_mr_cache_debugfs_init(dev);
+	mlx5_mkey_cache_debugfs_init(dev);
 
 	return 0;
 }
 
-int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev)
+int mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev)
 {
 	unsigned int i;
 
 	if (!dev->cache.wq)
 		return 0;
 
-	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
+	for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) {
 		struct mlx5_cache_ent *ent = &dev->cache.ent[i];
 
-		spin_lock_irq(&ent->lock);
+		xa_lock_irq(&ent->mkeys);
 		ent->disabled = true;
-		spin_unlock_irq(&ent->lock);
+		xa_unlock_irq(&ent->mkeys);
 		cancel_delayed_work_sync(&ent->dwork);
 	}
 
-	mlx5_mr_cache_debugfs_cleanup(dev);
+	mlx5_mkey_cache_debugfs_cleanup(dev);
 	mlx5_cmd_cleanup_async_ctx(&dev->async_ctx);
 
-	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++)
+	for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++)
 		clean_keys(dev, i);
 
 	destroy_workqueue(dev->cache.wq);
@@ -830,22 +872,22 @@ static int get_octo_len(u64 addr, u64 len, int page_shift)
 	return (npages + 1) / 2;
 }
 
-static int mr_cache_max_order(struct mlx5_ib_dev *dev)
+static int mkey_cache_max_order(struct mlx5_ib_dev *dev)
 {
 	if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
-		return MR_CACHE_LAST_STD_ENTRY + 2;
+		return MKEY_CACHE_LAST_STD_ENTRY + 2;
 	return MLX5_MAX_UMR_SHIFT;
 }
 
-static struct mlx5_cache_ent *mr_cache_ent_from_order(struct mlx5_ib_dev *dev,
-						      unsigned int order)
+static struct mlx5_cache_ent *mkey_cache_ent_from_order(struct mlx5_ib_dev *dev,
+							unsigned int order)
 {
-	struct mlx5_mr_cache *cache = &dev->cache;
+	struct mlx5_mkey_cache *cache = &dev->cache;
 
 	if (order < cache->ent[0].order)
 		return &cache->ent[0];
 	order = order - cache->ent[0].order;
-	if (order > MR_CACHE_LAST_STD_ENTRY)
+	if (order > MKEY_CACHE_LAST_STD_ENTRY)
 		return NULL;
 	return &cache->ent[order];
 }
@@ -888,7 +930,7 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
 						     0, iova);
 	if (WARN_ON(!page_size))
 		return ERR_PTR(-EINVAL);
-	ent = mr_cache_ent_from_order(
+	ent = mkey_cache_ent_from_order(
 		dev, order_base_2(ib_umem_num_dma_blocks(umem, page_size)));
 	/*
 	 * Matches access in alloc_cache_mr(). If the MR can't come from the
@@ -1320,7 +1362,7 @@ static bool can_use_umr_rereg_pas(struct mlx5_ib_mr *mr,
 	struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
 
 	/* We only track the allocated sizes of MRs from the cache */
-	if (!mr->cache_ent)
+	if (!mr->mmkey.cache_ent)
 		return false;
 	if (!mlx5r_umr_can_load_pas(dev, new_umem->length))
 		return false;
@@ -1329,7 +1371,7 @@ static bool can_use_umr_rereg_pas(struct mlx5_ib_mr *mr,
 		mlx5_umem_find_best_pgsz(new_umem, mkc, log_page_size, 0, iova);
 	if (WARN_ON(!*page_size))
 		return false;
-	return (1ULL << mr->cache_ent->order) >=
+	return (1ULL << mr->mmkey.cache_ent->order) >=
 	       ib_umem_num_dma_blocks(new_umem, *page_size);
 }
 
@@ -1570,15 +1612,17 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
 	}
 
 	/* Stop DMA */
-	if (mr->cache_ent) {
-		if (mlx5r_umr_revoke_mr(mr)) {
-			spin_lock_irq(&mr->cache_ent->lock);
-			mr->cache_ent->total_mrs--;
-			spin_unlock_irq(&mr->cache_ent->lock);
-			mr->cache_ent = NULL;
-		}
+	if (mr->mmkey.cache_ent) {
+		xa_lock_irq(&mr->mmkey.cache_ent->mkeys);
+		mr->mmkey.cache_ent->in_use--;
+		xa_unlock_irq(&mr->mmkey.cache_ent->mkeys);
+
+		if (mlx5r_umr_revoke_mr(mr) ||
+		    push_mkey(mr->mmkey.cache_ent, false,
+			      xa_mk_value(mr->mmkey.key)))
+			mr->mmkey.cache_ent = NULL;
 	}
-	if (!mr->cache_ent) {
+	if (!mr->mmkey.cache_ent) {
 		rc = destroy_mkey(to_mdev(mr->ibmr.device), mr);
 		if (rc)
 			return rc;
@@ -1595,12 +1639,10 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
 			mlx5_ib_free_odp_mr(mr);
 	}
 
-	if (mr->cache_ent) {
-		mlx5_mr_cache_free(dev, mr);
-	} else {
+	if (!mr->mmkey.cache_ent)
 		mlx5_free_priv_descs(mr);
-		kfree(mr);
-	}
+
+	kfree(mr);
 	return 0;
 }
 
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 84da5674e1ab..e305bf1dc6c2 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -1588,7 +1588,7 @@ mlx5_ib_odp_destroy_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq)
 	return err;
 }
 
-void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent)
+void mlx5_odp_init_mkey_cache_entry(struct mlx5_cache_ent *ent)
 {
 	if (!(ent->dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
 		return;
diff --git a/drivers/infiniband/hw/mlx5/umr.c b/drivers/infiniband/hw/mlx5/umr.c
index 3a48364c0918..e00b94d1b1ea 100644
--- a/drivers/infiniband/hw/mlx5/umr.c
+++ b/drivers/infiniband/hw/mlx5/umr.c
@@ -176,6 +176,7 @@ int mlx5r_umr_resource_init(struct mlx5_ib_dev *dev)
 	dev->umrc.pd = pd;
 
 	sema_init(&dev->umrc.sem, MAX_UMR_WR);
+	mutex_init(&dev->umrc.lock);
 
 	return 0;
 
@@ -195,6 +196,31 @@ void mlx5r_umr_resource_cleanup(struct mlx5_ib_dev *dev)
 	ib_dealloc_pd(dev->umrc.pd);
 }
 
+static int mlx5r_umr_recover(struct mlx5_ib_dev *dev)
+{
+	struct umr_common *umrc = &dev->umrc;
+	struct ib_qp_attr attr;
+	int err;
+
+	attr.qp_state = IB_QPS_RESET;
+	err = ib_modify_qp(umrc->qp, &attr, IB_QP_STATE);
+	if (err) {
+		mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n");
+		goto err;
+	}
+
+	err = mlx5r_umr_qp_rst2rts(dev, umrc->qp);
+	if (err)
+		goto err;
+
+	umrc->state = MLX5_UMR_STATE_ACTIVE;
+	return 0;
+
+err:
+	umrc->state = MLX5_UMR_STATE_ERR;
+	return err;
+}
+
 static int mlx5r_umr_post_send(struct ib_qp *ibqp, u32 mkey, struct ib_cqe *cqe,
 			       struct mlx5r_umr_wqe *wqe, bool with_data)
 {
@@ -231,7 +257,7 @@ static int mlx5r_umr_post_send(struct ib_qp *ibqp, u32 mkey, struct ib_cqe *cqe,
 
 	id.ib_cqe = cqe;
 	mlx5r_finish_wqe(qp, ctrl, seg, size, cur_edge, idx, id.wr_id, 0,
-			 MLX5_FENCE_MODE_NONE, MLX5_OPCODE_UMR);
+			 MLX5_FENCE_MODE_INITIATOR_SMALL, MLX5_OPCODE_UMR);
 
 	mlx5r_ring_db(qp, 1, ctrl);
 
@@ -270,17 +296,49 @@ static int mlx5r_umr_post_send_wait(struct mlx5_ib_dev *dev, u32 mkey,
 	mlx5r_umr_init_context(&umr_context);
 
 	down(&umrc->sem);
-	err = mlx5r_umr_post_send(umrc->qp, mkey, &umr_context.cqe, wqe,
-				  with_data);
-	if (err)
-		mlx5_ib_warn(dev, "UMR post send failed, err %d\n", err);
-	else {
-		wait_for_completion(&umr_context.done);
-		if (umr_context.status != IB_WC_SUCCESS) {
-			mlx5_ib_warn(dev, "reg umr failed (%u)\n",
-				     umr_context.status);
+	while (true) {
+		mutex_lock(&umrc->lock);
+		if (umrc->state == MLX5_UMR_STATE_ERR) {
+			mutex_unlock(&umrc->lock);
 			err = -EFAULT;
+			break;
 		}
+
+		if (umrc->state == MLX5_UMR_STATE_RECOVER) {
+			mutex_unlock(&umrc->lock);
+			usleep_range(3000, 5000);
+			continue;
+		}
+
+		err = mlx5r_umr_post_send(umrc->qp, mkey, &umr_context.cqe, wqe,
+					  with_data);
+		mutex_unlock(&umrc->lock);
+		if (err) {
+			mlx5_ib_warn(dev, "UMR post send failed, err %d\n",
+				     err);
+			break;
+		}
+
+		wait_for_completion(&umr_context.done);
+
+		if (umr_context.status == IB_WC_SUCCESS)
+			break;
+
+		if (umr_context.status == IB_WC_WR_FLUSH_ERR)
+			continue;
+
+		WARN_ON_ONCE(1);
+		mlx5_ib_warn(dev,
+			"reg umr failed (%u). Trying to recover and resubmit the flushed WQEs\n",
+			umr_context.status);
+		mutex_lock(&umrc->lock);
+		err = mlx5r_umr_recover(dev);
+		mutex_unlock(&umrc->lock);
+		if (err)
+			mlx5_ib_warn(dev, "couldn't recover UMR, err %d\n",
+				     err);
+		err = -EFAULT;
+		break;
 	}
 	up(&umrc->sem);
 	return err;
diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c
index 03ed7c0fae50..d745ce9dc88a 100644
--- a/drivers/infiniband/hw/qedr/verbs.c
+++ b/drivers/infiniband/hw/qedr/verbs.c
@@ -3084,7 +3084,7 @@ static struct qedr_mr *__qedr_alloc_mr(struct ib_pd *ibpd,
 		else
 			DP_ERR(dev, "roce alloc tid returned error %d\n", rc);
 
-		goto err0;
+		goto err1;
 	}
 
 	/* Index only, 18 bit long, lkey = itid << 8 | key */
@@ -3108,7 +3108,7 @@ static struct qedr_mr *__qedr_alloc_mr(struct ib_pd *ibpd,
 	rc = dev->ops->rdma_register_tid(dev->rdma_ctx, &mr->hw_mr);
 	if (rc) {
 		DP_ERR(dev, "roce register tid returned an error %d\n", rc);
-		goto err1;
+		goto err2;
 	}
 
 	mr->ibmr.lkey = mr->hw_mr.itid << 8 | mr->hw_mr.key;
@@ -3117,8 +3117,10 @@ static struct qedr_mr *__qedr_alloc_mr(struct ib_pd *ibpd,
 	DP_DEBUG(dev, QEDR_MSG_MR, "alloc frmr: %x\n", mr->ibmr.lkey);
 	return mr;
 
-err1:
+err2:
 	dev->ops->rdma_free_tid(dev->rdma_ctx, mr->hw_mr.itid);
+err1:
+	qedr_free_pbl(dev, &mr->info.pbl_info, mr->info.pbl_table);
 err0:
 	kfree(mr);
 	return ERR_PTR(rc);
diff --git a/drivers/infiniband/hw/qib/qib.h b/drivers/infiniband/hw/qib/qib.h
index b37b1c6d35c6..26c615772be3 100644
--- a/drivers/infiniband/hw/qib/qib.h
+++ b/drivers/infiniband/hw/qib/qib.h
@@ -321,7 +321,7 @@ struct qib_verbs_txreq {
  * These 7 values (SDR, DDR, and QDR may be ORed for auto-speed
  * negotiation) are used for the 3rd argument to path_f_set_ib_cfg
  * with cmd QIB_IB_CFG_SPD_ENB, by direct calls or via sysfs.  They
- * are also the the possible values for qib_link_speed_enabled and active
+ * are also the possible values for qib_link_speed_enabled and active
  * The values were chosen to match values used within the IB spec.
  */
 #define QIB_IB_SDR 1
diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c
index aa290928cf96..3937144b2ae5 100644
--- a/drivers/infiniband/hw/qib/qib_file_ops.c
+++ b/drivers/infiniband/hw/qib/qib_file_ops.c
@@ -153,7 +153,7 @@ static int qib_get_base_info(struct file *fp, void __user *ubase,
 		kinfo->spi_tidcnt += dd->rcvtidcnt % subctxt_cnt;
 	/*
 	 * for this use, may be cfgctxts summed over all chips that
-	 * are are configured and present
+	 * are configured and present
 	 */
 	kinfo->spi_nctxts = dd->cfgctxts;
 	/* unit (chip/board) our context is on */
@@ -851,7 +851,7 @@ static int mmap_rcvegrbufs(struct vm_area_struct *vma,
 		ret = -EPERM;
 		goto bail;
 	}
-	/* don't allow them to later change to writeable with mprotect */
+	/* don't allow them to later change to writable with mprotect */
 	vma->vm_flags &= ~VM_MAYWRITE;
 
 	start = vma->vm_start;
@@ -941,7 +941,7 @@ static int mmap_kvaddr(struct vm_area_struct *vma, u64 pgaddr,
 			goto bail;
 		}
 		/*
-		 * Don't allow permission to later change to writeable
+		 * Don't allow permission to later change to writable
 		 * with mprotect.
 		 */
 		vma->vm_flags &= ~VM_MAYWRITE;
diff --git a/drivers/infiniband/hw/qib/qib_iba7220.c b/drivers/infiniband/hw/qib/qib_iba7220.c
index 37b628a162e0..6af57067c32e 100644
--- a/drivers/infiniband/hw/qib/qib_iba7220.c
+++ b/drivers/infiniband/hw/qib/qib_iba7220.c
@@ -58,7 +58,7 @@ static void qib_set_ib_7220_lstate(struct qib_pportdata *, u16, u16);
 /*
  * This file contains almost all the chip-specific register information and
  * access functions for the QLogic QLogic_IB 7220 PCI-Express chip, with the
- * exception of SerDes support, which in in qib_sd7220.c.
+ * exception of SerDes support, which in qib_sd7220.c.
  */
 
 /* Below uses machine-generated qib_chipnum_regs.h file */
diff --git a/drivers/infiniband/hw/qib/qib_iba7322.c b/drivers/infiniband/hw/qib/qib_iba7322.c
index ceed302cf6a0..6861c6384f18 100644
--- a/drivers/infiniband/hw/qib/qib_iba7322.c
+++ b/drivers/infiniband/hw/qib/qib_iba7322.c
@@ -2850,9 +2850,9 @@ static void qib_setup_7322_cleanup(struct qib_devdata *dd)
 
 	qib_7322_free_irq(dd);
 	kfree(dd->cspec->cntrs);
-	kfree(dd->cspec->sendchkenable);
-	kfree(dd->cspec->sendgrhchk);
-	kfree(dd->cspec->sendibchk);
+	bitmap_free(dd->cspec->sendchkenable);
+	bitmap_free(dd->cspec->sendgrhchk);
+	bitmap_free(dd->cspec->sendibchk);
 	kfree(dd->cspec->msix_entries);
 	for (i = 0; i < dd->num_pports; i++) {
 		unsigned long flags;
@@ -6383,18 +6383,11 @@ static int qib_init_7322_variables(struct qib_devdata *dd)
 	features = qib_7322_boardname(dd);
 
 	/* now that piobcnt2k and 4k set, we can allocate these */
-	sbufcnt = dd->piobcnt2k + dd->piobcnt4k +
-		NUM_VL15_BUFS + BITS_PER_LONG - 1;
-	sbufcnt /= BITS_PER_LONG;
-	dd->cspec->sendchkenable =
-		kmalloc_array(sbufcnt, sizeof(*dd->cspec->sendchkenable),
-			      GFP_KERNEL);
-	dd->cspec->sendgrhchk =
-		kmalloc_array(sbufcnt, sizeof(*dd->cspec->sendgrhchk),
-			      GFP_KERNEL);
-	dd->cspec->sendibchk =
-		kmalloc_array(sbufcnt, sizeof(*dd->cspec->sendibchk),
-			      GFP_KERNEL);
+	sbufcnt = dd->piobcnt2k + dd->piobcnt4k + NUM_VL15_BUFS;
+
+	dd->cspec->sendchkenable = bitmap_zalloc(sbufcnt, GFP_KERNEL);
+	dd->cspec->sendgrhchk = bitmap_zalloc(sbufcnt, GFP_KERNEL);
+	dd->cspec->sendibchk = bitmap_zalloc(sbufcnt, GFP_KERNEL);
 	if (!dd->cspec->sendchkenable || !dd->cspec->sendgrhchk ||
 		!dd->cspec->sendibchk) {
 		ret = -ENOMEM;
diff --git a/drivers/infiniband/hw/qib/qib_init.c b/drivers/infiniband/hw/qib/qib_init.c
index d1a72e89e297..45211008449f 100644
--- a/drivers/infiniband/hw/qib/qib_init.c
+++ b/drivers/infiniband/hw/qib/qib_init.c
@@ -1106,8 +1106,7 @@ struct qib_devdata *qib_alloc_devdata(struct pci_dev *pdev, size_t extra)
 	if (!qib_cpulist_count) {
 		u32 count = num_online_cpus();
 
-		qib_cpulist = kcalloc(BITS_TO_LONGS(count), sizeof(long),
-				      GFP_KERNEL);
+		qib_cpulist = bitmap_zalloc(count, GFP_KERNEL);
 		if (qib_cpulist)
 			qib_cpulist_count = count;
 	}
@@ -1279,7 +1278,7 @@ static void __exit qib_ib_cleanup(void)
 #endif
 
 	qib_cpulist_count = 0;
-	kfree(qib_cpulist);
+	bitmap_free(qib_cpulist);
 
 	WARN_ON(!xa_empty(&qib_dev_table));
 	qib_dev_cleanup();
diff --git a/drivers/infiniband/hw/qib/qib_sd7220.c b/drivers/infiniband/hw/qib/qib_sd7220.c
index 81b810d006c0..1dc3ccf0cf1f 100644
--- a/drivers/infiniband/hw/qib/qib_sd7220.c
+++ b/drivers/infiniband/hw/qib/qib_sd7220.c
@@ -587,7 +587,7 @@ static int epb_access(struct qib_devdata *dd, int sdnum, int claim)
 		/* Need to release */
 		u64 pollval;
 		/*
-		 * The only writeable bits are the request and CS.
+		 * The only writable bits are the request and CS.
 		 * Both should be clear
 		 */
 		u64 newval = 0;
diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c
index e212929369df..67a1b4562dc2 100644
--- a/drivers/infiniband/hw/usnic/usnic_uiom.c
+++ b/drivers/infiniband/hw/usnic/usnic_uiom.c
@@ -482,7 +482,7 @@ int usnic_uiom_attach_dev_to_pd(struct usnic_uiom_pd *pd, struct device *dev)
 	if (err)
 		goto out_free_dev;
 
-	if (!iommu_capable(dev->bus, IOMMU_CAP_CACHE_COHERENCY)) {
+	if (!device_iommu_capable(dev, IOMMU_CAP_CACHE_COHERENCY)) {
 		usnic_err("IOMMU of %s does not support cache coherency\n",
 				dev_name(dev));
 		err = -EINVAL;
diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c
index da3a398053b8..fb0c008af78c 100644
--- a/drivers/infiniband/sw/rxe/rxe_comp.c
+++ b/drivers/infiniband/sw/rxe/rxe_comp.c
@@ -114,6 +114,8 @@ void retransmit_timer(struct timer_list *t)
 {
 	struct rxe_qp *qp = from_timer(qp, t, retrans_timer);
 
+	pr_debug("%s: fired for qp#%d\n", __func__, qp->elem.index);
+
 	if (qp->valid) {
 		qp->comp.timeout = 1;
 		rxe_run_task(&qp->comp.task, 1);
@@ -560,17 +562,16 @@ int rxe_completer(void *arg)
 	struct sk_buff *skb = NULL;
 	struct rxe_pkt_info *pkt = NULL;
 	enum comp_state state;
-	int ret = 0;
+	int ret;
 
 	if (!rxe_get(qp))
 		return -EAGAIN;
 
-	if (!qp->valid || qp->req.state == QP_STATE_ERROR ||
-	    qp->req.state == QP_STATE_RESET) {
+	if (!qp->valid || qp->comp.state == QP_STATE_ERROR ||
+	    qp->comp.state == QP_STATE_RESET) {
 		rxe_drain_resp_pkts(qp, qp->valid &&
-				    qp->req.state == QP_STATE_ERROR);
-		ret = -EAGAIN;
-		goto done;
+				    qp->comp.state == QP_STATE_ERROR);
+		goto exit;
 	}
 
 	if (qp->comp.timeout) {
@@ -580,10 +581,8 @@ int rxe_completer(void *arg)
 		qp->comp.timeout_retry = 0;
 	}
 
-	if (qp->req.need_retry) {
-		ret = -EAGAIN;
-		goto done;
-	}
+	if (qp->req.need_retry)
+		goto exit;
 
 	state = COMPST_GET_ACK;
 
@@ -676,8 +675,7 @@ int rxe_completer(void *arg)
 			    qp->qp_timeout_jiffies)
 				mod_timer(&qp->retrans_timer,
 					  jiffies + qp->qp_timeout_jiffies);
-			ret = -EAGAIN;
-			goto done;
+			goto exit;
 
 		case COMPST_ERROR_RETRY:
 			/* we come here if the retry timer fired and we did
@@ -689,10 +687,8 @@ int rxe_completer(void *arg)
 			 */
 
 			/* there is nothing to retry in this case */
-			if (!wqe || (wqe->state == wqe_state_posted)) {
-				ret = -EAGAIN;
-				goto done;
-			}
+			if (!wqe || (wqe->state == wqe_state_posted))
+				goto exit;
 
 			/* if we've started a retry, don't start another
 			 * retry sequence, unless this is a timeout.
@@ -730,18 +726,21 @@ int rxe_completer(void *arg)
 			break;
 
 		case COMPST_RNR_RETRY:
+			/* we come here if we received an RNR NAK */
 			if (qp->comp.rnr_retry > 0) {
 				if (qp->comp.rnr_retry != 7)
 					qp->comp.rnr_retry--;
 
-				qp->req.need_retry = 1;
+				/* don't start a retry flow until the
+				 * rnr timer has fired
+				 */
+				qp->req.wait_for_rnr_timer = 1;
 				pr_debug("qp#%d set rnr nak timer\n",
 					 qp_num(qp));
 				mod_timer(&qp->rnr_nak_timer,
 					  jiffies + rnrnak_jiffies(aeth_syn(pkt)
 						& ~AETH_TYPE_MASK));
-				ret = -EAGAIN;
-				goto done;
+				goto exit;
 			} else {
 				rxe_counter_inc(rxe,
 						RXE_CNT_RNR_RETRY_EXCEEDED);
@@ -754,12 +753,20 @@ int rxe_completer(void *arg)
 			WARN_ON_ONCE(wqe->status == IB_WC_SUCCESS);
 			do_complete(qp, wqe);
 			rxe_qp_error(qp);
-			ret = -EAGAIN;
-			goto done;
+			goto exit;
 		}
 	}
 
+	/* A non-zero return value will cause rxe_do_task to
+	 * exit its loop and end the tasklet. A zero return
+	 * will continue looping and return to rxe_completer
+	 */
 done:
+	ret = 0;
+	goto out;
+exit:
+	ret = -EAGAIN;
+out:
 	if (pkt)
 		free_pkt(pkt);
 	rxe_put(qp);
diff --git a/drivers/infiniband/sw/rxe/rxe_cq.c b/drivers/infiniband/sw/rxe/rxe_cq.c
index 642b52539ac3..b1a0ab3cd4bd 100644
--- a/drivers/infiniband/sw/rxe/rxe_cq.c
+++ b/drivers/infiniband/sw/rxe/rxe_cq.c
@@ -19,16 +19,16 @@ int rxe_cq_chk_attr(struct rxe_dev *rxe, struct rxe_cq *cq,
 	}
 
 	if (cqe > rxe->attr.max_cqe) {
-		pr_warn("cqe(%d) > max_cqe(%d)\n",
-			cqe, rxe->attr.max_cqe);
+		pr_debug("cqe(%d) > max_cqe(%d)\n",
+				cqe, rxe->attr.max_cqe);
 		goto err1;
 	}
 
 	if (cq) {
 		count = queue_count(cq->queue, QUEUE_TYPE_TO_CLIENT);
 		if (cqe < count) {
-			pr_warn("cqe(%d) < current # elements in queue (%d)",
-				cqe, count);
+			pr_debug("cqe(%d) < current # elements in queue (%d)",
+					cqe, count);
 			goto err1;
 		}
 	}
diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h
index 0e022ae1b8a5..22f6cc31d1d6 100644
--- a/drivers/infiniband/sw/rxe/rxe_loc.h
+++ b/drivers/infiniband/sw/rxe/rxe_loc.h
@@ -77,9 +77,8 @@ struct rxe_mr *lookup_mr(struct rxe_pd *pd, int access, u32 key,
 			 enum rxe_mr_lookup_type type);
 int mr_check_range(struct rxe_mr *mr, u64 iova, size_t length);
 int advance_dma_data(struct rxe_dma_info *dma, unsigned int length);
-int rxe_invalidate_mr(struct rxe_qp *qp, u32 rkey);
+int rxe_invalidate_mr(struct rxe_qp *qp, u32 key);
 int rxe_reg_fast_mr(struct rxe_qp *qp, struct rxe_send_wqe *wqe);
-int rxe_mr_set_page(struct ib_mr *ibmr, u64 addr);
 int rxe_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata);
 void rxe_mr_cleanup(struct rxe_pool_elem *elem);
 
@@ -145,7 +144,7 @@ static inline int rcv_wqe_size(int max_sge)
 		max_sge * sizeof(struct ib_sge);
 }
 
-void free_rd_atomic_resource(struct rxe_qp *qp, struct resp_res *res);
+void free_rd_atomic_resource(struct resp_res *res);
 
 static inline void rxe_advance_resp_resource(struct rxe_qp *qp)
 {
diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c
index fc3942e04a1f..850b80f5ad8b 100644
--- a/drivers/infiniband/sw/rxe/rxe_mr.c
+++ b/drivers/infiniband/sw/rxe/rxe_mr.c
@@ -24,7 +24,7 @@ u8 rxe_get_next_key(u32 last_key)
 
 int mr_check_range(struct rxe_mr *mr, u64 iova, size_t length)
 {
-	struct rxe_map_set *set = mr->cur_map_set;
+
 
 	switch (mr->type) {
 	case IB_MR_TYPE_DMA:
@@ -32,8 +32,8 @@ int mr_check_range(struct rxe_mr *mr, u64 iova, size_t length)
 
 	case IB_MR_TYPE_USER:
 	case IB_MR_TYPE_MEM_REG:
-		if (iova < set->iova || length > set->length ||
-		    iova > set->iova + set->length - length)
+		if (iova < mr->iova || length > mr->length ||
+		    iova > mr->iova + mr->length - length)
 			return -EFAULT;
 		return 0;
 
@@ -65,89 +65,41 @@ static void rxe_mr_init(int access, struct rxe_mr *mr)
 	mr->map_shift = ilog2(RXE_BUF_PER_MAP);
 }
 
-static void rxe_mr_free_map_set(int num_map, struct rxe_map_set *set)
+static int rxe_mr_alloc(struct rxe_mr *mr, int num_buf)
 {
 	int i;
+	int num_map;
+	struct rxe_map **map = mr->map;
 
-	for (i = 0; i < num_map; i++)
-		kfree(set->map[i]);
+	num_map = (num_buf + RXE_BUF_PER_MAP - 1) / RXE_BUF_PER_MAP;
 
-	kfree(set->map);
-	kfree(set);
-}
-
-static int rxe_mr_alloc_map_set(int num_map, struct rxe_map_set **setp)
-{
-	int i;
-	struct rxe_map_set *set;
-
-	set = kmalloc(sizeof(*set), GFP_KERNEL);
-	if (!set)
-		goto err_out;
-
-	set->map = kmalloc_array(num_map, sizeof(struct rxe_map *), GFP_KERNEL);
-	if (!set->map)
-		goto err_free_set;
+	mr->map = kmalloc_array(num_map, sizeof(*map), GFP_KERNEL);
+	if (!mr->map)
+		goto err1;
 
 	for (i = 0; i < num_map; i++) {
-		set->map[i] = kmalloc(sizeof(struct rxe_map), GFP_KERNEL);
-		if (!set->map[i])
-			goto err_free_map;
+		mr->map[i] = kmalloc(sizeof(**map), GFP_KERNEL);
+		if (!mr->map[i])
+			goto err2;
 	}
 
-	*setp = set;
-
-	return 0;
-
-err_free_map:
-	for (i--; i >= 0; i--)
-		kfree(set->map[i]);
-
-	kfree(set->map);
-err_free_set:
-	kfree(set);
-err_out:
-	return -ENOMEM;
-}
-
-/**
- * rxe_mr_alloc() - Allocate memory map array(s) for MR
- * @mr: Memory region
- * @num_buf: Number of buffer descriptors to support
- * @both: If non zero allocate both mr->map and mr->next_map
- *	  else just allocate mr->map. Used for fast MRs
- *
- * Return: 0 on success else an error
- */
-static int rxe_mr_alloc(struct rxe_mr *mr, int num_buf, int both)
-{
-	int ret;
-	int num_map;
-
 	BUILD_BUG_ON(!is_power_of_2(RXE_BUF_PER_MAP));
-	num_map = (num_buf + RXE_BUF_PER_MAP - 1) / RXE_BUF_PER_MAP;
 
 	mr->map_shift = ilog2(RXE_BUF_PER_MAP);
 	mr->map_mask = RXE_BUF_PER_MAP - 1;
+
 	mr->num_buf = num_buf;
-	mr->max_buf = num_map * RXE_BUF_PER_MAP;
 	mr->num_map = num_map;
-
-	ret = rxe_mr_alloc_map_set(num_map, &mr->cur_map_set);
-	if (ret)
-		return -ENOMEM;
-
-	if (both) {
-		ret = rxe_mr_alloc_map_set(num_map, &mr->next_map_set);
-		if (ret)
-			goto err_free;
-	}
+	mr->max_buf = num_map * RXE_BUF_PER_MAP;
 
 	return 0;
 
-err_free:
-	rxe_mr_free_map_set(mr->num_map, mr->cur_map_set);
-	mr->cur_map_set = NULL;
+err2:
+	for (i--; i >= 0; i--)
+		kfree(mr->map[i]);
+
+	kfree(mr->map);
+err1:
 	return -ENOMEM;
 }
 
@@ -164,7 +116,6 @@ void rxe_mr_init_dma(struct rxe_pd *pd, int access, struct rxe_mr *mr)
 int rxe_mr_init_user(struct rxe_pd *pd, u64 start, u64 length, u64 iova,
 		     int access, struct rxe_mr *mr)
 {
-	struct rxe_map_set	*set;
 	struct rxe_map		**map;
 	struct rxe_phys_buf	*buf = NULL;
 	struct ib_umem		*umem;
@@ -172,6 +123,7 @@ int rxe_mr_init_user(struct rxe_pd *pd, u64 start, u64 length, u64 iova,
 	int			num_buf;
 	void			*vaddr;
 	int err;
+	int i;
 
 	umem = ib_umem_get(pd->ibpd.device, start, length, access);
 	if (IS_ERR(umem)) {
@@ -185,20 +137,18 @@ int rxe_mr_init_user(struct rxe_pd *pd, u64 start, u64 length, u64 iova,
 
 	rxe_mr_init(access, mr);
 
-	err = rxe_mr_alloc(mr, num_buf, 0);
+	err = rxe_mr_alloc(mr, num_buf);
 	if (err) {
 		pr_warn("%s: Unable to allocate memory for map\n",
 				__func__);
 		goto err_release_umem;
 	}
 
-	set = mr->cur_map_set;
-	set->page_shift = PAGE_SHIFT;
-	set->page_mask = PAGE_SIZE - 1;
-
-	num_buf = 0;
-	map = set->map;
+	mr->page_shift = PAGE_SHIFT;
+	mr->page_mask = PAGE_SIZE - 1;
 
+	num_buf			= 0;
+	map = mr->map;
 	if (length > 0) {
 		buf = map[0]->buf;
 
@@ -214,29 +164,33 @@ int rxe_mr_init_user(struct rxe_pd *pd, u64 start, u64 length, u64 iova,
 				pr_warn("%s: Unable to get virtual address\n",
 						__func__);
 				err = -ENOMEM;
-				goto err_release_umem;
+				goto err_cleanup_map;
 			}
 
 			buf->addr = (uintptr_t)vaddr;
 			buf->size = PAGE_SIZE;
 			num_buf++;
 			buf++;
+
 		}
 	}
 
 	mr->ibmr.pd = &pd->ibpd;
 	mr->umem = umem;
 	mr->access = access;
+	mr->length = length;
+	mr->iova = iova;
+	mr->va = start;
+	mr->offset = ib_umem_offset(umem);
 	mr->state = RXE_MR_STATE_VALID;
 	mr->type = IB_MR_TYPE_USER;
 
-	set->length = length;
-	set->iova = iova;
-	set->va = start;
-	set->offset = ib_umem_offset(umem);
-
 	return 0;
 
+err_cleanup_map:
+	for (i = 0; i < mr->num_map; i++)
+		kfree(mr->map[i]);
+	kfree(mr->map);
 err_release_umem:
 	ib_umem_release(umem);
 err_out:
@@ -250,7 +204,7 @@ int rxe_mr_init_fast(struct rxe_pd *pd, int max_pages, struct rxe_mr *mr)
 	/* always allow remote access for FMRs */
 	rxe_mr_init(IB_ACCESS_REMOTE, mr);
 
-	err = rxe_mr_alloc(mr, max_pages, 1);
+	err = rxe_mr_alloc(mr, max_pages);
 	if (err)
 		goto err1;
 
@@ -268,24 +222,21 @@ err1:
 static void lookup_iova(struct rxe_mr *mr, u64 iova, int *m_out, int *n_out,
 			size_t *offset_out)
 {
-	struct rxe_map_set *set = mr->cur_map_set;
-	size_t offset = iova - set->iova + set->offset;
+	size_t offset = iova - mr->iova + mr->offset;
 	int			map_index;
 	int			buf_index;
 	u64			length;
-	struct rxe_map *map;
 
-	if (likely(set->page_shift)) {
-		*offset_out = offset & set->page_mask;
-		offset >>= set->page_shift;
+	if (likely(mr->page_shift)) {
+		*offset_out = offset & mr->page_mask;
+		offset >>= mr->page_shift;
 		*n_out = offset & mr->map_mask;
 		*m_out = offset >> mr->map_shift;
 	} else {
 		map_index = 0;
 		buf_index = 0;
 
-		map = set->map[map_index];
-		length = map->buf[buf_index].size;
+		length = mr->map[map_index]->buf[buf_index].size;
 
 		while (offset >= length) {
 			offset -= length;
@@ -295,8 +246,7 @@ static void lookup_iova(struct rxe_mr *mr, u64 iova, int *m_out, int *n_out,
 				map_index++;
 				buf_index = 0;
 			}
-			map = set->map[map_index];
-			length = map->buf[buf_index].size;
+			length = mr->map[map_index]->buf[buf_index].size;
 		}
 
 		*m_out = map_index;
@@ -317,7 +267,7 @@ void *iova_to_vaddr(struct rxe_mr *mr, u64 iova, int length)
 		goto out;
 	}
 
-	if (!mr->cur_map_set) {
+	if (!mr->map) {
 		addr = (void *)(uintptr_t)iova;
 		goto out;
 	}
@@ -330,13 +280,13 @@ void *iova_to_vaddr(struct rxe_mr *mr, u64 iova, int length)
 
 	lookup_iova(mr, iova, &m, &n, &offset);
 
-	if (offset + length > mr->cur_map_set->map[m]->buf[n].size) {
+	if (offset + length > mr->map[m]->buf[n].size) {
 		pr_warn("crosses page boundary\n");
 		addr = NULL;
 		goto out;
 	}
 
-	addr = (void *)(uintptr_t)mr->cur_map_set->map[m]->buf[n].addr + offset;
+	addr = (void *)(uintptr_t)mr->map[m]->buf[n].addr + offset;
 
 out:
 	return addr;
@@ -372,7 +322,7 @@ int rxe_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, int length,
 		return 0;
 	}
 
-	WARN_ON_ONCE(!mr->cur_map_set);
+	WARN_ON_ONCE(!mr->map);
 
 	err = mr_check_range(mr, iova, length);
 	if (err) {
@@ -382,7 +332,7 @@ int rxe_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, int length,
 
 	lookup_iova(mr, iova, &m, &i, &offset);
 
-	map = mr->cur_map_set->map + m;
+	map = mr->map + m;
 	buf	= map[0]->buf + i;
 
 	while (length > 0) {
@@ -576,22 +526,22 @@ struct rxe_mr *lookup_mr(struct rxe_pd *pd, int access, u32 key,
 	return mr;
 }
 
-int rxe_invalidate_mr(struct rxe_qp *qp, u32 rkey)
+int rxe_invalidate_mr(struct rxe_qp *qp, u32 key)
 {
 	struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
 	struct rxe_mr *mr;
 	int ret;
 
-	mr = rxe_pool_get_index(&rxe->mr_pool, rkey >> 8);
+	mr = rxe_pool_get_index(&rxe->mr_pool, key >> 8);
 	if (!mr) {
-		pr_err("%s: No MR for rkey %#x\n", __func__, rkey);
+		pr_err("%s: No MR for key %#x\n", __func__, key);
 		ret = -EINVAL;
 		goto err;
 	}
 
-	if (rkey != mr->rkey) {
-		pr_err("%s: rkey (%#x) doesn't match mr->rkey (%#x)\n",
-			__func__, rkey, mr->rkey);
+	if (mr->rkey ? (key != mr->rkey) : (key != mr->lkey)) {
+		pr_err("%s: wr key (%#x) doesn't match mr key (%#x)\n",
+			__func__, key, (mr->rkey ? mr->rkey : mr->lkey));
 		ret = -EINVAL;
 		goto err_drop_ref;
 	}
@@ -628,9 +578,8 @@ err:
 int rxe_reg_fast_mr(struct rxe_qp *qp, struct rxe_send_wqe *wqe)
 {
 	struct rxe_mr *mr = to_rmr(wqe->wr.wr.reg.mr);
-	u32 key = wqe->wr.wr.reg.key & 0xff;
+	u32 key = wqe->wr.wr.reg.key;
 	u32 access = wqe->wr.wr.reg.access;
-	struct rxe_map_set *set;
 
 	/* user can only register MR in free state */
 	if (unlikely(mr->state != RXE_MR_STATE_FREE)) {
@@ -646,36 +595,19 @@ int rxe_reg_fast_mr(struct rxe_qp *qp, struct rxe_send_wqe *wqe)
 		return -EINVAL;
 	}
 
+	/* user is only allowed to change key portion of l/rkey */
+	if (unlikely((mr->lkey & ~0xff) != (key & ~0xff))) {
+		pr_warn("%s: key = 0x%x has wrong index mr->lkey = 0x%x\n",
+			__func__, key, mr->lkey);
+		return -EINVAL;
+	}
+
 	mr->access = access;
-	mr->lkey = (mr->lkey & ~0xff) | key;
-	mr->rkey = (access & IB_ACCESS_REMOTE) ? mr->lkey : 0;
+	mr->lkey = key;
+	mr->rkey = (access & IB_ACCESS_REMOTE) ? key : 0;
+	mr->iova = wqe->wr.wr.reg.mr->iova;
 	mr->state = RXE_MR_STATE_VALID;
 
-	set = mr->cur_map_set;
-	mr->cur_map_set = mr->next_map_set;
-	mr->cur_map_set->iova = wqe->wr.wr.reg.mr->iova;
-	mr->next_map_set = set;
-
-	return 0;
-}
-
-int rxe_mr_set_page(struct ib_mr *ibmr, u64 addr)
-{
-	struct rxe_mr *mr = to_rmr(ibmr);
-	struct rxe_map_set *set = mr->next_map_set;
-	struct rxe_map *map;
-	struct rxe_phys_buf *buf;
-
-	if (unlikely(set->nbuf == mr->num_buf))
-		return -ENOMEM;
-
-	map = set->map[set->nbuf / RXE_BUF_PER_MAP];
-	buf = &map->buf[set->nbuf % RXE_BUF_PER_MAP];
-
-	buf->addr = addr;
-	buf->size = ibmr->page_size;
-	set->nbuf++;
-
 	return 0;
 }
 
@@ -687,7 +619,7 @@ int rxe_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
 	if (atomic_read(&mr->num_mw) > 0)
 		return -EINVAL;
 
-	rxe_put(mr);
+	rxe_cleanup(mr);
 
 	return 0;
 }
@@ -695,14 +627,15 @@ int rxe_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
 void rxe_mr_cleanup(struct rxe_pool_elem *elem)
 {
 	struct rxe_mr *mr = container_of(elem, typeof(*mr), elem);
+	int i;
 
 	rxe_put(mr_pd(mr));
-
 	ib_umem_release(mr->umem);
 
-	if (mr->cur_map_set)
-		rxe_mr_free_map_set(mr->num_map, mr->cur_map_set);
+	if (mr->map) {
+		for (i = 0; i < mr->num_map; i++)
+			kfree(mr->map[i]);
 
-	if (mr->next_map_set)
-		rxe_mr_free_map_set(mr->num_map, mr->next_map_set);
+		kfree(mr->map);
+	}
 }
diff --git a/drivers/infiniband/sw/rxe/rxe_mw.c b/drivers/infiniband/sw/rxe/rxe_mw.c
index 2e1fa844fabf..104993801a80 100644
--- a/drivers/infiniband/sw/rxe/rxe_mw.c
+++ b/drivers/infiniband/sw/rxe/rxe_mw.c
@@ -33,6 +33,8 @@ int rxe_alloc_mw(struct ib_mw *ibmw, struct ib_udata *udata)
 			RXE_MW_STATE_FREE : RXE_MW_STATE_VALID;
 	spin_lock_init(&mw->lock);
 
+	rxe_finalize(mw);
+
 	return 0;
 }
 
@@ -40,7 +42,7 @@ int rxe_dealloc_mw(struct ib_mw *ibmw)
 {
 	struct rxe_mw *mw = to_rmw(ibmw);
 
-	rxe_put(mw);
+	rxe_cleanup(mw);
 
 	return 0;
 }
@@ -48,8 +50,6 @@ int rxe_dealloc_mw(struct ib_mw *ibmw)
 static int rxe_check_bind_mw(struct rxe_qp *qp, struct rxe_send_wqe *wqe,
 			 struct rxe_mw *mw, struct rxe_mr *mr)
 {
-	u32 key = wqe->wr.wr.mw.rkey & 0xff;
-
 	if (mw->ibmw.type == IB_MW_TYPE_1) {
 		if (unlikely(mw->state != RXE_MW_STATE_VALID)) {
 			pr_err_once(
@@ -87,11 +87,6 @@ static int rxe_check_bind_mw(struct rxe_qp *qp, struct rxe_send_wqe *wqe,
 		}
 	}
 
-	if (unlikely(key == (mw->rkey & 0xff))) {
-		pr_err_once("attempt to bind MW with same key\n");
-		return -EINVAL;
-	}
-
 	/* remaining checks only apply to a nonzero MR */
 	if (!mr)
 		return 0;
@@ -113,21 +108,21 @@ static int rxe_check_bind_mw(struct rxe_qp *qp, struct rxe_send_wqe *wqe,
 		      (IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_ATOMIC)) &&
 		     !(mr->access & IB_ACCESS_LOCAL_WRITE))) {
 		pr_err_once(
-			"attempt to bind an writeable MW to an MR without local write access\n");
+			"attempt to bind an Writable MW to an MR without local write access\n");
 		return -EINVAL;
 	}
 
 	/* C10-75 */
 	if (mw->access & IB_ZERO_BASED) {
-		if (unlikely(wqe->wr.wr.mw.length > mr->cur_map_set->length)) {
+		if (unlikely(wqe->wr.wr.mw.length > mr->length)) {
 			pr_err_once(
 				"attempt to bind a ZB MW outside of the MR\n");
 			return -EINVAL;
 		}
 	} else {
-		if (unlikely((wqe->wr.wr.mw.addr < mr->cur_map_set->iova) ||
+		if (unlikely((wqe->wr.wr.mw.addr < mr->iova) ||
 			     ((wqe->wr.wr.mw.addr + wqe->wr.wr.mw.length) >
-			      (mr->cur_map_set->iova + mr->cur_map_set->length)))) {
+			      (mr->iova + mr->length)))) {
 			pr_err_once(
 				"attempt to bind a VA MW outside of the MR\n");
 			return -EINVAL;
diff --git a/drivers/infiniband/sw/rxe/rxe_param.h b/drivers/infiniband/sw/rxe/rxe_param.h
index 568a7cbd13d4..86c7a8bf3cbb 100644
--- a/drivers/infiniband/sw/rxe/rxe_param.h
+++ b/drivers/infiniband/sw/rxe/rxe_param.h
@@ -105,6 +105,12 @@ enum rxe_device_param {
 	RXE_INFLIGHT_SKBS_PER_QP_HIGH	= 64,
 	RXE_INFLIGHT_SKBS_PER_QP_LOW	= 16,
 
+	/* Max number of interations of each tasklet
+	 * before yielding the cpu to let other
+	 * work make progress
+	 */
+	RXE_MAX_ITERATIONS		= 1024,
+
 	/* Delay before calling arbiter timer */
 	RXE_NSEC_ARB_TIMER_DELAY	= 200,
 
diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c
index 19b14826385b..f50620f5a0a1 100644
--- a/drivers/infiniband/sw/rxe/rxe_pool.c
+++ b/drivers/infiniband/sw/rxe/rxe_pool.c
@@ -6,6 +6,7 @@
 
 #include "rxe.h"
 
+#define RXE_POOL_TIMEOUT	(200)
 #define RXE_POOL_ALIGN		(16)
 
 static const struct rxe_type_info {
@@ -136,10 +137,14 @@ void *rxe_alloc(struct rxe_pool *pool)
 	elem->pool = pool;
 	elem->obj = obj;
 	kref_init(&elem->ref_cnt);
+	init_completion(&elem->complete);
 
-	err = xa_alloc_cyclic(&pool->xa, &elem->index, elem, pool->limit,
+	/* allocate index in array but leave pointer as NULL so it
+	 * can't be looked up until rxe_finalize() is called
+	 */
+	err = xa_alloc_cyclic(&pool->xa, &elem->index, NULL, pool->limit,
 			      &pool->next, GFP_KERNEL);
-	if (err)
+	if (err < 0)
 		goto err_free;
 
 	return obj;
@@ -151,9 +156,11 @@ err_cnt:
 	return NULL;
 }
 
-int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_elem *elem)
+int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_elem *elem,
+				bool sleepable)
 {
 	int err;
+	gfp_t gfp_flags;
 
 	if (WARN_ON(pool->type == RXE_TYPE_MR))
 		return -EINVAL;
@@ -164,10 +171,19 @@ int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_elem *elem)
 	elem->pool = pool;
 	elem->obj = (u8 *)elem - pool->elem_offset;
 	kref_init(&elem->ref_cnt);
+	init_completion(&elem->complete);
 
-	err = xa_alloc_cyclic(&pool->xa, &elem->index, elem, pool->limit,
-			      &pool->next, GFP_KERNEL);
-	if (err)
+	/* AH objects are unique in that the create_ah verb
+	 * can be called in atomic context. If the create_ah
+	 * call is not sleepable use GFP_ATOMIC.
+	 */
+	gfp_flags = sleepable ? GFP_KERNEL : GFP_ATOMIC;
+
+	if (sleepable)
+		might_sleep();
+	err = xa_alloc_cyclic(&pool->xa, &elem->index, NULL, pool->limit,
+			      &pool->next, gfp_flags);
+	if (err < 0)
 		goto err_cnt;
 
 	return 0;
@@ -181,16 +197,15 @@ void *rxe_pool_get_index(struct rxe_pool *pool, u32 index)
 {
 	struct rxe_pool_elem *elem;
 	struct xarray *xa = &pool->xa;
-	unsigned long flags;
 	void *obj;
 
-	xa_lock_irqsave(xa, flags);
+	rcu_read_lock();
 	elem = xa_load(xa, index);
 	if (elem && kref_get_unless_zero(&elem->ref_cnt))
 		obj = elem->obj;
 	else
 		obj = NULL;
-	xa_unlock_irqrestore(xa, flags);
+	rcu_read_unlock();
 
 	return obj;
 }
@@ -198,17 +213,74 @@ void *rxe_pool_get_index(struct rxe_pool *pool, u32 index)
 static void rxe_elem_release(struct kref *kref)
 {
 	struct rxe_pool_elem *elem = container_of(kref, typeof(*elem), ref_cnt);
-	struct rxe_pool *pool = elem->pool;
 
-	xa_erase(&pool->xa, elem->index);
+	complete(&elem->complete);
+}
+
+int __rxe_cleanup(struct rxe_pool_elem *elem, bool sleepable)
+{
+	struct rxe_pool *pool = elem->pool;
+	struct xarray *xa = &pool->xa;
+	static int timeout = RXE_POOL_TIMEOUT;
+	int ret, err = 0;
+	void *xa_ret;
+
+	if (sleepable)
+		might_sleep();
+
+	/* erase xarray entry to prevent looking up
+	 * the pool elem from its index
+	 */
+	xa_ret = xa_erase(xa, elem->index);
+	WARN_ON(xa_err(xa_ret));
+
+	/* if this is the last call to rxe_put complete the
+	 * object. It is safe to touch obj->elem after this since
+	 * it is freed below
+	 */
+	__rxe_put(elem);
+
+	/* wait until all references to the object have been
+	 * dropped before final object specific cleanup and
+	 * return to rdma-core
+	 */
+	if (sleepable) {
+		if (!completion_done(&elem->complete) && timeout) {
+			ret = wait_for_completion_timeout(&elem->complete,
+					timeout);
+
+			/* Shouldn't happen. There are still references to
+			 * the object but, rather than deadlock, free the
+			 * object or pass back to rdma-core.
+			 */
+			if (WARN_ON(!ret))
+				err = -EINVAL;
+		}
+	} else {
+		unsigned long until = jiffies + timeout;
+
+		/* AH objects are unique in that the destroy_ah verb
+		 * can be called in atomic context. This delay
+		 * replaces the wait_for_completion call above
+		 * when the destroy_ah call is not sleepable
+		 */
+		while (!completion_done(&elem->complete) &&
+				time_before(jiffies, until))
+			mdelay(1);
+
+		if (WARN_ON(!completion_done(&elem->complete)))
+			err = -EINVAL;
+	}
 
 	if (pool->cleanup)
 		pool->cleanup(elem);
 
 	if (pool->type == RXE_TYPE_MR)
-		kfree(elem->obj);
+		kfree_rcu(elem->obj);
 
 	atomic_dec(&pool->num_elem);
+
+	return err;
 }
 
 int __rxe_get(struct rxe_pool_elem *elem)
@@ -220,3 +292,11 @@ int __rxe_put(struct rxe_pool_elem *elem)
 {
 	return kref_put(&elem->ref_cnt, rxe_elem_release);
 }
+
+void __rxe_finalize(struct rxe_pool_elem *elem)
+{
+	void *xa_ret;
+
+	xa_ret = xa_store(&elem->pool->xa, elem->index, elem, GFP_KERNEL);
+	WARN_ON(xa_err(xa_ret));
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_pool.h b/drivers/infiniband/sw/rxe/rxe_pool.h
index 0860660d65ec..9d83cb32092f 100644
--- a/drivers/infiniband/sw/rxe/rxe_pool.h
+++ b/drivers/infiniband/sw/rxe/rxe_pool.h
@@ -24,6 +24,7 @@ struct rxe_pool_elem {
 	void			*obj;
 	struct kref		ref_cnt;
 	struct list_head	list;
+	struct completion	complete;
 	u32			index;
 };
 
@@ -57,21 +58,28 @@ void rxe_pool_cleanup(struct rxe_pool *pool);
 void *rxe_alloc(struct rxe_pool *pool);
 
 /* connect already allocated object to pool */
-int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_elem *elem);
-
-#define rxe_add_to_pool(pool, obj) __rxe_add_to_pool(pool, &(obj)->elem)
+int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_elem *elem,
+				bool sleepable);
+#define rxe_add_to_pool(pool, obj) __rxe_add_to_pool(pool, &(obj)->elem, true)
+#define rxe_add_to_pool_ah(pool, obj, sleepable) __rxe_add_to_pool(pool, \
+				&(obj)->elem, sleepable)
 
 /* lookup an indexed object from index. takes a reference on object */
 void *rxe_pool_get_index(struct rxe_pool *pool, u32 index);
 
 int __rxe_get(struct rxe_pool_elem *elem);
-
 #define rxe_get(obj) __rxe_get(&(obj)->elem)
 
 int __rxe_put(struct rxe_pool_elem *elem);
-
 #define rxe_put(obj) __rxe_put(&(obj)->elem)
 
+int __rxe_cleanup(struct rxe_pool_elem *elem, bool sleepable);
+#define rxe_cleanup(obj) __rxe_cleanup(&(obj)->elem, true)
+#define rxe_cleanup_ah(obj, sleepable) __rxe_cleanup(&(obj)->elem, sleepable)
+
 #define rxe_read(obj) kref_read(&(obj)->elem.ref_cnt)
 
+void __rxe_finalize(struct rxe_pool_elem *elem);
+#define rxe_finalize(obj) __rxe_finalize(&(obj)->elem)
+
 #endif /* RXE_POOL_H */
diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c
index 22e9b85344c3..516bf9b95e48 100644
--- a/drivers/infiniband/sw/rxe/rxe_qp.c
+++ b/drivers/infiniband/sw/rxe/rxe_qp.c
@@ -120,17 +120,15 @@ static void free_rd_atomic_resources(struct rxe_qp *qp)
 		for (i = 0; i < qp->attr.max_dest_rd_atomic; i++) {
 			struct resp_res *res = &qp->resp.resources[i];
 
-			free_rd_atomic_resource(qp, res);
+			free_rd_atomic_resource(res);
 		}
 		kfree(qp->resp.resources);
 		qp->resp.resources = NULL;
 	}
 }
 
-void free_rd_atomic_resource(struct rxe_qp *qp, struct resp_res *res)
+void free_rd_atomic_resource(struct resp_res *res)
 {
-	if (res->type == RXE_ATOMIC_MASK)
-		kfree_skb(res->atomic.skb);
 	res->type = 0;
 }
 
@@ -142,7 +140,7 @@ static void cleanup_rd_atomic_resources(struct rxe_qp *qp)
 	if (qp->resp.resources) {
 		for (i = 0; i < qp->attr.max_dest_rd_atomic; i++) {
 			res = &qp->resp.resources[i];
-			free_rd_atomic_resource(qp, res);
+			free_rd_atomic_resource(res);
 		}
 	}
 }
@@ -174,6 +172,14 @@ static void rxe_qp_init_misc(struct rxe_dev *rxe, struct rxe_qp *qp,
 
 	spin_lock_init(&qp->state_lock);
 
+	spin_lock_init(&qp->req.task.state_lock);
+	spin_lock_init(&qp->resp.task.state_lock);
+	spin_lock_init(&qp->comp.task.state_lock);
+
+	spin_lock_init(&qp->sq.sq_lock);
+	spin_lock_init(&qp->rq.producer_lock);
+	spin_lock_init(&qp->rq.consumer_lock);
+
 	atomic_set(&qp->ssn, 0);
 	atomic_set(&qp->skb_out, 0);
 }
@@ -230,10 +236,10 @@ static int rxe_qp_init_req(struct rxe_dev *rxe, struct rxe_qp *qp,
 					       QUEUE_TYPE_FROM_CLIENT);
 
 	qp->req.state		= QP_STATE_RESET;
+	qp->comp.state		= QP_STATE_RESET;
 	qp->req.opcode		= -1;
 	qp->comp.opcode		= -1;
 
-	spin_lock_init(&qp->sq.sq_lock);
 	skb_queue_head_init(&qp->req_pkts);
 
 	rxe_init_task(rxe, &qp->req.task, qp,
@@ -284,9 +290,6 @@ static int rxe_qp_init_resp(struct rxe_dev *rxe, struct rxe_qp *qp,
 		}
 	}
 
-	spin_lock_init(&qp->rq.producer_lock);
-	spin_lock_init(&qp->rq.consumer_lock);
-
 	skb_queue_head_init(&qp->resp_pkts);
 
 	rxe_init_task(rxe, &qp->resp.task, qp,
@@ -490,6 +493,7 @@ static void rxe_qp_reset(struct rxe_qp *qp)
 
 	/* move qp to the reset state */
 	qp->req.state = QP_STATE_RESET;
+	qp->comp.state = QP_STATE_RESET;
 	qp->resp.state = QP_STATE_RESET;
 
 	/* let state machines reset themselves drain work and packet queues
@@ -507,6 +511,7 @@ static void rxe_qp_reset(struct rxe_qp *qp)
 	atomic_set(&qp->ssn, 0);
 	qp->req.opcode = -1;
 	qp->req.need_retry = 0;
+	qp->req.wait_for_rnr_timer = 0;
 	qp->req.noack_pkts = 0;
 	qp->resp.msn = 0;
 	qp->resp.opcode = -1;
@@ -552,6 +557,7 @@ void rxe_qp_error(struct rxe_qp *qp)
 {
 	qp->req.state = QP_STATE_ERROR;
 	qp->resp.state = QP_STATE_ERROR;
+	qp->comp.state = QP_STATE_ERROR;
 	qp->attr.qp_state = IB_QPS_ERR;
 
 	/* drain work and packet queues */
@@ -689,6 +695,7 @@ int rxe_qp_from_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask,
 			pr_debug("qp#%d state -> INIT\n", qp_num(qp));
 			qp->req.state = QP_STATE_INIT;
 			qp->resp.state = QP_STATE_INIT;
+			qp->comp.state = QP_STATE_INIT;
 			break;
 
 		case IB_QPS_RTR:
@@ -699,6 +706,7 @@ int rxe_qp_from_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask,
 		case IB_QPS_RTS:
 			pr_debug("qp#%d state -> RTS\n", qp_num(qp));
 			qp->req.state = QP_STATE_READY;
+			qp->comp.state = QP_STATE_READY;
 			break;
 
 		case IB_QPS_SQD:
@@ -804,13 +812,15 @@ static void rxe_qp_do_cleanup(struct work_struct *work)
 	if (qp->rq.queue)
 		rxe_queue_cleanup(qp->rq.queue);
 
-	atomic_dec(&qp->scq->num_wq);
-	if (qp->scq)
+	if (qp->scq) {
+		atomic_dec(&qp->scq->num_wq);
 		rxe_put(qp->scq);
+	}
 
-	atomic_dec(&qp->rcq->num_wq);
-	if (qp->rcq)
+	if (qp->rcq) {
+		atomic_dec(&qp->rcq->num_wq);
 		rxe_put(qp->rcq);
+	}
 
 	if (qp->pd)
 		rxe_put(qp->pd);
diff --git a/drivers/infiniband/sw/rxe/rxe_queue.h b/drivers/infiniband/sw/rxe/rxe_queue.h
index 6227112ef7a2..ed44042782fa 100644
--- a/drivers/infiniband/sw/rxe/rxe_queue.h
+++ b/drivers/infiniband/sw/rxe/rxe_queue.h
@@ -7,9 +7,6 @@
 #ifndef RXE_QUEUE_H
 #define RXE_QUEUE_H
 
-/* for definition of shared struct rxe_queue_buf */
-#include <uapi/rdma/rdma_user_rxe.h>
-
 /* Implements a simple circular buffer that is shared between user
  * and the driver and can be resized. The requested element size is
  * rounded up to a power of 2 and the number of elements in the buffer
@@ -53,6 +50,8 @@ enum queue_type {
 	QUEUE_TYPE_FROM_DRIVER,
 };
 
+struct rxe_queue_buf;
+
 struct rxe_queue {
 	struct rxe_dev		*rxe;
 	struct rxe_queue_buf	*buf;
diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c
index 9d98237389cf..f63771207970 100644
--- a/drivers/infiniband/sw/rxe/rxe_req.c
+++ b/drivers/infiniband/sw/rxe/rxe_req.c
@@ -15,8 +15,7 @@ static int next_opcode(struct rxe_qp *qp, struct rxe_send_wqe *wqe,
 		       u32 opcode);
 
 static inline void retry_first_write_send(struct rxe_qp *qp,
-					  struct rxe_send_wqe *wqe,
-					  unsigned int mask, int npsn)
+					  struct rxe_send_wqe *wqe, int npsn)
 {
 	int i;
 
@@ -83,7 +82,7 @@ static void req_retry(struct rxe_qp *qp)
 			if (mask & WR_WRITE_OR_SEND_MASK) {
 				npsn = (qp->comp.psn - wqe->first_psn) &
 					BTH_PSN_MASK;
-				retry_first_write_send(qp, wqe, mask, npsn);
+				retry_first_write_send(qp, wqe, npsn);
 			}
 
 			if (mask & WR_READ_MASK) {
@@ -101,7 +100,11 @@ void rnr_nak_timer(struct timer_list *t)
 {
 	struct rxe_qp *qp = from_timer(qp, t, rnr_nak_timer);
 
-	pr_debug("qp#%d rnr nak timer fired\n", qp_num(qp));
+	pr_debug("%s: fired for qp#%d\n", __func__, qp_num(qp));
+
+	/* request a send queue retry */
+	qp->req.need_retry = 1;
+	qp->req.wait_for_rnr_timer = 0;
 	rxe_run_task(&qp->req.task, 1);
 }
 
@@ -161,16 +164,36 @@ static struct rxe_send_wqe *req_next_wqe(struct rxe_qp *qp)
 		     (wqe->state != wqe_state_processing)))
 		return NULL;
 
-	if (unlikely((wqe->wr.send_flags & IB_SEND_FENCE) &&
-						     (index != cons))) {
-		qp->req.wait_fence = 1;
-		return NULL;
-	}
-
 	wqe->mask = wr_opcode_mask(wqe->wr.opcode, qp);
 	return wqe;
 }
 
+/**
+ * rxe_wqe_is_fenced - check if next wqe is fenced
+ * @qp: the queue pair
+ * @wqe: the next wqe
+ *
+ * Returns: 1 if wqe needs to wait
+ *	    0 if wqe is ready to go
+ */
+static int rxe_wqe_is_fenced(struct rxe_qp *qp, struct rxe_send_wqe *wqe)
+{
+	/* Local invalidate fence (LIF) see IBA 10.6.5.1
+	 * Requires ALL previous operations on the send queue
+	 * are complete. Make mandatory for the rxe driver.
+	 */
+	if (wqe->wr.opcode == IB_WR_LOCAL_INV)
+		return qp->req.wqe_index != queue_get_consumer(qp->sq.queue,
+						QUEUE_TYPE_FROM_CLIENT);
+
+	/* Fence see IBA 10.8.3.3
+	 * Requires that all previous read and atomic operations
+	 * are complete.
+	 */
+	return (wqe->wr.send_flags & IB_SEND_FENCE) &&
+		atomic_read(&qp->req.rd_atomic) != qp->attr.max_rd_atomic;
+}
+
 static int next_opcode_rc(struct rxe_qp *qp, u32 opcode, int fits)
 {
 	switch (opcode) {
@@ -581,9 +604,11 @@ static int rxe_do_local_ops(struct rxe_qp *qp, struct rxe_send_wqe *wqe)
 	wqe->status = IB_WC_SUCCESS;
 	qp->req.wqe_index = queue_next_index(qp->sq.queue, qp->req.wqe_index);
 
-	if ((wqe->wr.send_flags & IB_SEND_SIGNALED) ||
-	    qp->sq_sig_type == IB_SIGNAL_ALL_WR)
-		rxe_run_task(&qp->comp.task, 1);
+	/* There is no ack coming for local work requests
+	 * which can lead to a deadlock. So go ahead and complete
+	 * it now.
+	 */
+	rxe_run_task(&qp->comp.task, 1);
 
 	return 0;
 }
@@ -599,6 +624,7 @@ int rxe_requester(void *arg)
 	u32 payload;
 	int mtu;
 	int opcode;
+	int err;
 	int ret;
 	struct rxe_send_wqe rollback_wqe;
 	u32 rollback_psn;
@@ -609,10 +635,20 @@ int rxe_requester(void *arg)
 	if (!rxe_get(qp))
 		return -EAGAIN;
 
-next_wqe:
-	if (unlikely(!qp->valid || qp->req.state == QP_STATE_ERROR))
+	if (unlikely(!qp->valid))
 		goto exit;
 
+	if (unlikely(qp->req.state == QP_STATE_ERROR)) {
+		wqe = req_next_wqe(qp);
+		if (wqe)
+			/*
+			 * Generate an error completion for error qp state
+			 */
+			goto err;
+		else
+			goto exit;
+	}
+
 	if (unlikely(qp->req.state == QP_STATE_RESET)) {
 		qp->req.wqe_index = queue_get_consumer(q,
 						QUEUE_TYPE_FROM_CLIENT);
@@ -620,10 +656,17 @@ next_wqe:
 		qp->req.need_rd_atomic = 0;
 		qp->req.wait_psn = 0;
 		qp->req.need_retry = 0;
+		qp->req.wait_for_rnr_timer = 0;
 		goto exit;
 	}
 
-	if (unlikely(qp->req.need_retry)) {
+	/* we come here if the retransmit timer has fired
+	 * or if the rnr timer has fired. If the retransmit
+	 * timer fires while we are processing an RNR NAK wait
+	 * until the rnr timer has fired before starting the
+	 * retry flow
+	 */
+	if (unlikely(qp->req.need_retry && !qp->req.wait_for_rnr_timer)) {
 		req_retry(qp);
 		qp->req.need_retry = 0;
 	}
@@ -632,12 +675,17 @@ next_wqe:
 	if (unlikely(!wqe))
 		goto exit;
 
+	if (rxe_wqe_is_fenced(qp, wqe)) {
+		qp->req.wait_fence = 1;
+		goto exit;
+	}
+
 	if (wqe->mask & WR_LOCAL_OP_MASK) {
-		ret = rxe_do_local_ops(qp, wqe);
-		if (unlikely(ret))
+		err = rxe_do_local_ops(qp, wqe);
+		if (unlikely(err))
 			goto err;
 		else
-			goto next_wqe;
+			goto done;
 	}
 
 	if (unlikely(qp_type(qp) == IB_QPT_RC &&
@@ -685,9 +733,8 @@ next_wqe:
 						       qp->req.wqe_index);
 			wqe->state = wqe_state_done;
 			wqe->status = IB_WC_SUCCESS;
-			__rxe_do_task(&qp->comp.task);
-			rxe_put(qp);
-			return 0;
+			rxe_run_task(&qp->comp.task, 0);
+			goto done;
 		}
 		payload = mtu;
 	}
@@ -703,25 +750,29 @@ next_wqe:
 	if (unlikely(!av)) {
 		pr_err("qp#%d Failed no address vector\n", qp_num(qp));
 		wqe->status = IB_WC_LOC_QP_OP_ERR;
-		goto err_drop_ah;
+		goto err;
 	}
 
 	skb = init_req_packet(qp, av, wqe, opcode, payload, &pkt);
 	if (unlikely(!skb)) {
 		pr_err("qp#%d Failed allocating skb\n", qp_num(qp));
 		wqe->status = IB_WC_LOC_QP_OP_ERR;
-		goto err_drop_ah;
+		if (ah)
+			rxe_put(ah);
+		goto err;
 	}
 
-	ret = finish_packet(qp, av, wqe, &pkt, skb, payload);
-	if (unlikely(ret)) {
+	err = finish_packet(qp, av, wqe, &pkt, skb, payload);
+	if (unlikely(err)) {
 		pr_debug("qp#%d Error during finish packet\n", qp_num(qp));
-		if (ret == -EFAULT)
+		if (err == -EFAULT)
 			wqe->status = IB_WC_LOC_PROT_ERR;
 		else
 			wqe->status = IB_WC_LOC_QP_OP_ERR;
 		kfree_skb(skb);
-		goto err_drop_ah;
+		if (ah)
+			rxe_put(ah);
+		goto err;
 	}
 
 	if (ah)
@@ -736,13 +787,14 @@ next_wqe:
 	save_state(wqe, qp, &rollback_wqe, &rollback_psn);
 	update_wqe_state(qp, wqe, &pkt);
 	update_wqe_psn(qp, wqe, &pkt, payload);
-	ret = rxe_xmit_packet(qp, &pkt, skb);
-	if (ret) {
+
+	err = rxe_xmit_packet(qp, &pkt, skb);
+	if (err) {
 		qp->need_req_skb = 1;
 
 		rollback_state(wqe, qp, &rollback_wqe, rollback_psn);
 
-		if (ret == -EAGAIN) {
+		if (err == -EAGAIN) {
 			rxe_run_task(&qp->req.task, 1);
 			goto exit;
 		}
@@ -753,16 +805,23 @@ next_wqe:
 
 	update_state(qp, &pkt);
 
-	goto next_wqe;
-
-err_drop_ah:
-	if (ah)
-		rxe_put(ah);
+	/* A non-zero return value will cause rxe_do_task to
+	 * exit its loop and end the tasklet. A zero return
+	 * will continue looping and return to rxe_requester
+	 */
+done:
+	ret = 0;
+	goto out;
 err:
+	/* update wqe_index for each wqe completion */
+	qp->req.wqe_index = queue_next_index(qp->sq.queue, qp->req.wqe_index);
 	wqe->state = wqe_state_error;
-	__rxe_do_task(&qp->comp.task);
-
+	qp->req.state = QP_STATE_ERROR;
+	rxe_run_task(&qp->comp.task, 0);
 exit:
+	ret = -EAGAIN;
+out:
 	rxe_put(qp);
-	return -EAGAIN;
+
+	return ret;
 }
diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c
index f4f6ee5d81fe..b36ec5c4d5e0 100644
--- a/drivers/infiniband/sw/rxe/rxe_resp.c
+++ b/drivers/infiniband/sw/rxe/rxe_resp.c
@@ -21,6 +21,7 @@ enum resp_states {
 	RESPST_CHK_RKEY,
 	RESPST_EXECUTE,
 	RESPST_READ_REPLY,
+	RESPST_ATOMIC_REPLY,
 	RESPST_COMPLETE,
 	RESPST_ACKNOWLEDGE,
 	RESPST_CLEANUP,
@@ -55,6 +56,7 @@ static char *resp_state_name[] = {
 	[RESPST_CHK_RKEY]			= "CHK_RKEY",
 	[RESPST_EXECUTE]			= "EXECUTE",
 	[RESPST_READ_REPLY]			= "READ_REPLY",
+	[RESPST_ATOMIC_REPLY]			= "ATOMIC_REPLY",
 	[RESPST_COMPLETE]			= "COMPLETE",
 	[RESPST_ACKNOWLEDGE]			= "ACKNOWLEDGE",
 	[RESPST_CLEANUP]			= "CLEANUP",
@@ -448,7 +450,8 @@ static enum resp_states check_rkey(struct rxe_qp *qp,
 	if (rkey_is_mw(rkey)) {
 		mw = rxe_lookup_mw(qp, access, rkey);
 		if (!mw) {
-			pr_err("%s: no MW matches rkey %#x\n", __func__, rkey);
+			pr_debug("%s: no MW matches rkey %#x\n",
+					__func__, rkey);
 			state = RESPST_ERR_RKEY_VIOLATION;
 			goto err;
 		}
@@ -468,7 +471,8 @@ static enum resp_states check_rkey(struct rxe_qp *qp,
 	} else {
 		mr = lookup_mr(qp->pd, access, rkey, RXE_LOOKUP_REMOTE);
 		if (!mr) {
-			pr_err("%s: no MR matches rkey %#x\n", __func__, rkey);
+			pr_debug("%s: no MR matches rkey %#x\n",
+					__func__, rkey);
 			state = RESPST_ERR_RKEY_VIOLATION;
 			goto err;
 		}
@@ -549,49 +553,106 @@ out:
 	return rc;
 }
 
+static struct resp_res *rxe_prepare_res(struct rxe_qp *qp,
+					struct rxe_pkt_info *pkt,
+					int type)
+{
+	struct resp_res *res;
+	u32 pkts;
+
+	res = &qp->resp.resources[qp->resp.res_head];
+	rxe_advance_resp_resource(qp);
+	free_rd_atomic_resource(res);
+
+	res->type = type;
+	res->replay = 0;
+
+	switch (type) {
+	case RXE_READ_MASK:
+		res->read.va = qp->resp.va + qp->resp.offset;
+		res->read.va_org = qp->resp.va + qp->resp.offset;
+		res->read.resid = qp->resp.resid;
+		res->read.length = qp->resp.resid;
+		res->read.rkey = qp->resp.rkey;
+
+		pkts = max_t(u32, (reth_len(pkt) + qp->mtu - 1)/qp->mtu, 1);
+		res->first_psn = pkt->psn;
+		res->cur_psn = pkt->psn;
+		res->last_psn = (pkt->psn + pkts - 1) & BTH_PSN_MASK;
+
+		res->state = rdatm_res_state_new;
+		break;
+	case RXE_ATOMIC_MASK:
+		res->first_psn = pkt->psn;
+		res->last_psn = pkt->psn;
+		res->cur_psn = pkt->psn;
+		break;
+	}
+
+	return res;
+}
+
 /* Guarantee atomicity of atomic operations at the machine level. */
 static DEFINE_SPINLOCK(atomic_ops_lock);
 
-static enum resp_states process_atomic(struct rxe_qp *qp,
-				       struct rxe_pkt_info *pkt)
+static enum resp_states atomic_reply(struct rxe_qp *qp,
+					 struct rxe_pkt_info *pkt)
 {
 	u64 *vaddr;
 	enum resp_states ret;
 	struct rxe_mr *mr = qp->resp.mr;
+	struct resp_res *res = qp->resp.res;
+	u64 value;
 
-	if (mr->state != RXE_MR_STATE_VALID) {
-		ret = RESPST_ERR_RKEY_VIOLATION;
-		goto out;
+	if (!res) {
+		res = rxe_prepare_res(qp, pkt, RXE_ATOMIC_MASK);
+		qp->resp.res = res;
 	}
 
-	vaddr = iova_to_vaddr(mr, qp->resp.va + qp->resp.offset, sizeof(u64));
+	if (!res->replay) {
+		if (mr->state != RXE_MR_STATE_VALID) {
+			ret = RESPST_ERR_RKEY_VIOLATION;
+			goto out;
+		}
 
-	/* check vaddr is 8 bytes aligned. */
-	if (!vaddr || (uintptr_t)vaddr & 7) {
-		ret = RESPST_ERR_MISALIGNED_ATOMIC;
-		goto out;
+		vaddr = iova_to_vaddr(mr, qp->resp.va + qp->resp.offset,
+					sizeof(u64));
+
+		/* check vaddr is 8 bytes aligned. */
+		if (!vaddr || (uintptr_t)vaddr & 7) {
+			ret = RESPST_ERR_MISALIGNED_ATOMIC;
+			goto out;
+		}
+
+		spin_lock_bh(&atomic_ops_lock);
+		res->atomic.orig_val = value = *vaddr;
+
+		if (pkt->opcode == IB_OPCODE_RC_COMPARE_SWAP) {
+			if (value == atmeth_comp(pkt))
+				value = atmeth_swap_add(pkt);
+		} else {
+			value += atmeth_swap_add(pkt);
+		}
+
+		*vaddr = value;
+		spin_unlock_bh(&atomic_ops_lock);
+
+		qp->resp.msn++;
+
+		/* next expected psn, read handles this separately */
+		qp->resp.psn = (pkt->psn + 1) & BTH_PSN_MASK;
+		qp->resp.ack_psn = qp->resp.psn;
+
+		qp->resp.opcode = pkt->opcode;
+		qp->resp.status = IB_WC_SUCCESS;
 	}
 
-	spin_lock_bh(&atomic_ops_lock);
-
-	qp->resp.atomic_orig = *vaddr;
-
-	if (pkt->opcode == IB_OPCODE_RC_COMPARE_SWAP) {
-		if (*vaddr == atmeth_comp(pkt))
-			*vaddr = atmeth_swap_add(pkt);
-	} else {
-		*vaddr += atmeth_swap_add(pkt);
-	}
-
-	spin_unlock_bh(&atomic_ops_lock);
-
-	ret = RESPST_NONE;
+	ret = RESPST_ACKNOWLEDGE;
 out:
 	return ret;
 }
 
 static struct sk_buff *prepare_ack_packet(struct rxe_qp *qp,
-					  struct rxe_pkt_info *pkt,
 					  struct rxe_pkt_info *ack,
 					  int opcode,
 					  int payload,
@@ -629,7 +690,7 @@ static struct sk_buff *prepare_ack_packet(struct rxe_qp *qp,
 	}
 
 	if (ack->mask & RXE_ATMACK_MASK)
-		atmack_set_orig(ack, qp->resp.atomic_orig);
+		atmack_set_orig(ack, qp->resp.res->atomic.orig_val);
 
 	err = rxe_prepare(&qp->pri_av, ack, skb);
 	if (err) {
@@ -640,34 +701,6 @@ static struct sk_buff *prepare_ack_packet(struct rxe_qp *qp,
 	return skb;
 }
 
-static struct resp_res *rxe_prepare_read_res(struct rxe_qp *qp,
-					struct rxe_pkt_info *pkt)
-{
-	struct resp_res *res;
-	u32 pkts;
-
-	res = &qp->resp.resources[qp->resp.res_head];
-	rxe_advance_resp_resource(qp);
-	free_rd_atomic_resource(qp, res);
-
-	res->type = RXE_READ_MASK;
-	res->replay = 0;
-	res->read.va = qp->resp.va + qp->resp.offset;
-	res->read.va_org = qp->resp.va + qp->resp.offset;
-	res->read.resid = qp->resp.resid;
-	res->read.length = qp->resp.resid;
-	res->read.rkey = qp->resp.rkey;
-
-	pkts = max_t(u32, (reth_len(pkt) + qp->mtu - 1)/qp->mtu, 1);
-	res->first_psn = pkt->psn;
-	res->cur_psn = pkt->psn;
-	res->last_psn = (pkt->psn + pkts - 1) & BTH_PSN_MASK;
-
-	res->state = rdatm_res_state_new;
-
-	return res;
-}
-
 /**
  * rxe_recheck_mr - revalidate MR from rkey and get a reference
  * @qp: the qp
@@ -738,7 +771,7 @@ static enum resp_states read_reply(struct rxe_qp *qp,
 	struct rxe_mr *mr;
 
 	if (!res) {
-		res = rxe_prepare_read_res(qp, req_pkt);
+		res = rxe_prepare_res(qp, req_pkt, RXE_READ_MASK);
 		qp->resp.res = res;
 	}
 
@@ -771,7 +804,7 @@ static enum resp_states read_reply(struct rxe_qp *qp,
 
 	payload = min_t(int, res->read.resid, mtu);
 
-	skb = prepare_ack_packet(qp, req_pkt, &ack_pkt, opcode, payload,
+	skb = prepare_ack_packet(qp, &ack_pkt, opcode, payload,
 				 res->cur_psn, AETH_ACK_UNLIMITED);
 	if (!skb)
 		return RESPST_ERR_RNR;
@@ -858,9 +891,7 @@ static enum resp_states execute(struct rxe_qp *qp, struct rxe_pkt_info *pkt)
 		qp->resp.msn++;
 		return RESPST_READ_REPLY;
 	} else if (pkt->mask & RXE_ATOMIC_MASK) {
-		err = process_atomic(qp, pkt);
-		if (err)
-			return err;
+		return RESPST_ATOMIC_REPLY;
 	} else {
 		/* Unreachable */
 		WARN_ON_ONCE(1);
@@ -997,14 +1028,13 @@ finish:
 		return RESPST_CLEANUP;
 }
 
-static int send_ack(struct rxe_qp *qp, struct rxe_pkt_info *pkt,
-		    u8 syndrome, u32 psn)
+static int send_ack(struct rxe_qp *qp, u8 syndrome, u32 psn)
 {
 	int err = 0;
 	struct rxe_pkt_info ack_pkt;
 	struct sk_buff *skb;
 
-	skb = prepare_ack_packet(qp, pkt, &ack_pkt, IB_OPCODE_RC_ACKNOWLEDGE,
+	skb = prepare_ack_packet(qp, &ack_pkt, IB_OPCODE_RC_ACKNOWLEDGE,
 				 0, psn, syndrome);
 	if (!skb) {
 		err = -ENOMEM;
@@ -1019,40 +1049,29 @@ err1:
 	return err;
 }
 
-static int send_atomic_ack(struct rxe_qp *qp, struct rxe_pkt_info *pkt,
-			   u8 syndrome)
+static int send_atomic_ack(struct rxe_qp *qp, u8 syndrome, u32 psn)
 {
-	int rc = 0;
+	int err = 0;
 	struct rxe_pkt_info ack_pkt;
 	struct sk_buff *skb;
-	struct resp_res *res;
 
-	skb = prepare_ack_packet(qp, pkt, &ack_pkt,
-				 IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE, 0, pkt->psn,
-				 syndrome);
+	skb = prepare_ack_packet(qp, &ack_pkt, IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE,
+				 0, psn, syndrome);
 	if (!skb) {
-		rc = -ENOMEM;
+		err = -ENOMEM;
 		goto out;
 	}
 
-	res = &qp->resp.resources[qp->resp.res_head];
-	free_rd_atomic_resource(qp, res);
-	rxe_advance_resp_resource(qp);
+	err = rxe_xmit_packet(qp, &ack_pkt, skb);
+	if (err)
+		pr_err_ratelimited("Failed sending atomic ack\n");
 
-	skb_get(skb);
-	res->type = RXE_ATOMIC_MASK;
-	res->atomic.skb = skb;
-	res->first_psn = ack_pkt.psn;
-	res->last_psn  = ack_pkt.psn;
-	res->cur_psn   = ack_pkt.psn;
-
-	rc = rxe_xmit_packet(qp, &ack_pkt, skb);
-	if (rc) {
-		pr_err_ratelimited("Failed sending ack\n");
-		rxe_put(qp);
-	}
+	/* have to clear this since it is used to trigger
+	 * long read replies
+	 */
+	qp->resp.res = NULL;
 out:
-	return rc;
+	return err;
 }
 
 static enum resp_states acknowledge(struct rxe_qp *qp,
@@ -1062,11 +1081,11 @@ static enum resp_states acknowledge(struct rxe_qp *qp,
 		return RESPST_CLEANUP;
 
 	if (qp->resp.aeth_syndrome != AETH_ACK_UNLIMITED)
-		send_ack(qp, pkt, qp->resp.aeth_syndrome, pkt->psn);
+		send_ack(qp, qp->resp.aeth_syndrome, pkt->psn);
 	else if (pkt->mask & RXE_ATOMIC_MASK)
-		send_atomic_ack(qp, pkt, AETH_ACK_UNLIMITED);
+		send_atomic_ack(qp, AETH_ACK_UNLIMITED, pkt->psn);
 	else if (bth_ack(pkt))
-		send_ack(qp, pkt, AETH_ACK_UNLIMITED, pkt->psn);
+		send_ack(qp, AETH_ACK_UNLIMITED, pkt->psn);
 
 	return RESPST_CLEANUP;
 }
@@ -1119,7 +1138,7 @@ static enum resp_states duplicate_request(struct rxe_qp *qp,
 	if (pkt->mask & RXE_SEND_MASK ||
 	    pkt->mask & RXE_WRITE_MASK) {
 		/* SEND. Ack again and cleanup. C9-105. */
-		send_ack(qp, pkt, AETH_ACK_UNLIMITED, prev_psn);
+		send_ack(qp, AETH_ACK_UNLIMITED, prev_psn);
 		return RESPST_CLEANUP;
 	} else if (pkt->mask & RXE_READ_MASK) {
 		struct resp_res *res;
@@ -1173,14 +1192,11 @@ static enum resp_states duplicate_request(struct rxe_qp *qp,
 		/* Find the operation in our list of responder resources. */
 		res = find_resource(qp, pkt->psn);
 		if (res) {
-			skb_get(res->atomic.skb);
-			/* Resend the result. */
-			rc = rxe_xmit_packet(qp, pkt, res->atomic.skb);
-			if (rc) {
-				pr_err("Failed resending result. This flow is not handled - skb ignored\n");
-				rc = RESPST_CLEANUP;
-				goto out;
-			}
+			res->replay = 1;
+			res->cur_psn = pkt->psn;
+			qp->resp.res = res;
+			rc = RESPST_ATOMIC_REPLY;
+			goto out;
 		}
 
 		/* Resource not found. Class D error. Drop the request. */
@@ -1260,17 +1276,15 @@ int rxe_responder(void *arg)
 	struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
 	enum resp_states state;
 	struct rxe_pkt_info *pkt = NULL;
-	int ret = 0;
+	int ret;
 
 	if (!rxe_get(qp))
 		return -EAGAIN;
 
 	qp->resp.aeth_syndrome = AETH_ACK_UNLIMITED;
 
-	if (!qp->valid) {
-		ret = -EINVAL;
-		goto done;
-	}
+	if (!qp->valid)
+		goto exit;
 
 	switch (qp->resp.state) {
 	case QP_STATE_RESET:
@@ -1316,6 +1330,9 @@ int rxe_responder(void *arg)
 		case RESPST_READ_REPLY:
 			state = read_reply(qp, pkt);
 			break;
+		case RESPST_ATOMIC_REPLY:
+			state = atomic_reply(qp, pkt);
+			break;
 		case RESPST_ACKNOWLEDGE:
 			state = acknowledge(qp, pkt);
 			break;
@@ -1327,7 +1344,7 @@ int rxe_responder(void *arg)
 			break;
 		case RESPST_ERR_PSN_OUT_OF_SEQ:
 			/* RC only - Class B. Drop packet. */
-			send_ack(qp, pkt, AETH_NAK_PSN_SEQ_ERROR, qp->resp.psn);
+			send_ack(qp, AETH_NAK_PSN_SEQ_ERROR, qp->resp.psn);
 			state = RESPST_CLEANUP;
 			break;
 
@@ -1349,7 +1366,7 @@ int rxe_responder(void *arg)
 			if (qp_type(qp) == IB_QPT_RC) {
 				rxe_counter_inc(rxe, RXE_CNT_SND_RNR);
 				/* RC - class B */
-				send_ack(qp, pkt, AETH_RNR_NAK |
+				send_ack(qp, AETH_RNR_NAK |
 					 (~AETH_TYPE_MASK &
 					 qp->attr.min_rnr_timer),
 					 pkt->psn);
@@ -1438,7 +1455,7 @@ int rxe_responder(void *arg)
 
 		case RESPST_ERROR:
 			qp->resp.goto_error = 0;
-			pr_warn("qp#%d moved to error state\n", qp_num(qp));
+			pr_debug("qp#%d moved to error state\n", qp_num(qp));
 			rxe_qp_error(qp);
 			goto exit;
 
@@ -1447,9 +1464,16 @@ int rxe_responder(void *arg)
 		}
 	}
 
+	/* A non-zero return value will cause rxe_do_task to
+	 * exit its loop and end the tasklet. A zero return
+	 * will continue looping and return to rxe_responder
+	 */
+done:
+	ret = 0;
+	goto out;
 exit:
 	ret = -EAGAIN;
-done:
+out:
 	rxe_put(qp);
 	return ret;
 }
diff --git a/drivers/infiniband/sw/rxe/rxe_task.c b/drivers/infiniband/sw/rxe/rxe_task.c
index 0c4db5bb17d7..2248cf33d776 100644
--- a/drivers/infiniband/sw/rxe/rxe_task.c
+++ b/drivers/infiniband/sw/rxe/rxe_task.c
@@ -8,7 +8,7 @@
 #include <linux/interrupt.h>
 #include <linux/hardirq.h>
 
-#include "rxe_task.h"
+#include "rxe.h"
 
 int __rxe_do_task(struct rxe_task *task)
 
@@ -33,6 +33,7 @@ void rxe_do_task(struct tasklet_struct *t)
 	int cont;
 	int ret;
 	struct rxe_task *task = from_tasklet(task, t, tasklet);
+	unsigned int iterations = RXE_MAX_ITERATIONS;
 
 	spin_lock_bh(&task->state_lock);
 	switch (task->state) {
@@ -61,13 +62,20 @@ void rxe_do_task(struct tasklet_struct *t)
 		spin_lock_bh(&task->state_lock);
 		switch (task->state) {
 		case TASK_STATE_BUSY:
-			if (ret)
+			if (ret) {
 				task->state = TASK_STATE_START;
-			else
+			} else if (iterations--) {
 				cont = 1;
+			} else {
+				/* reschedule the tasklet and exit
+				 * the loop to give up the cpu
+				 */
+				tasklet_schedule(&task->tasklet);
+				task->state = TASK_STATE_START;
+			}
 			break;
 
-		/* soneone tried to run the task since the last time we called
+		/* someone tried to run the task since the last time we called
 		 * func, so we will call one more time regardless of the
 		 * return value
 		 */
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c
index 9d995854a174..e264cf69bf55 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.c
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.c
@@ -115,7 +115,7 @@ static void rxe_dealloc_ucontext(struct ib_ucontext *ibuc)
 {
 	struct rxe_ucontext *uc = to_ruc(ibuc);
 
-	rxe_put(uc);
+	rxe_cleanup(uc);
 }
 
 static int rxe_port_immutable(struct ib_device *dev, u32 port_num,
@@ -149,7 +149,7 @@ static int rxe_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
 {
 	struct rxe_pd *pd = to_rpd(ibpd);
 
-	rxe_put(pd);
+	rxe_cleanup(pd);
 	return 0;
 }
 
@@ -176,7 +176,8 @@ static int rxe_create_ah(struct ib_ah *ibah,
 	if (err)
 		return err;
 
-	err = rxe_add_to_pool(&rxe->ah_pool, ah);
+	err = rxe_add_to_pool_ah(&rxe->ah_pool, ah,
+			init_attr->flags & RDMA_CREATE_AH_SLEEPABLE);
 	if (err)
 		return err;
 
@@ -188,7 +189,7 @@ static int rxe_create_ah(struct ib_ah *ibah,
 		err = copy_to_user(&uresp->ah_num, &ah->ah_num,
 					 sizeof(uresp->ah_num));
 		if (err) {
-			rxe_put(ah);
+			rxe_cleanup(ah);
 			return -EFAULT;
 		}
 	} else if (ah->is_user) {
@@ -197,6 +198,8 @@ static int rxe_create_ah(struct ib_ah *ibah,
 	}
 
 	rxe_init_av(init_attr->ah_attr, &ah->av);
+	rxe_finalize(ah);
+
 	return 0;
 }
 
@@ -228,7 +231,8 @@ static int rxe_destroy_ah(struct ib_ah *ibah, u32 flags)
 {
 	struct rxe_ah *ah = to_rah(ibah);
 
-	rxe_put(ah);
+	rxe_cleanup_ah(ah, flags & RDMA_DESTROY_AH_SLEEPABLE);
+
 	return 0;
 }
 
@@ -308,12 +312,13 @@ static int rxe_create_srq(struct ib_srq *ibsrq, struct ib_srq_init_attr *init,
 
 	err = rxe_srq_from_init(rxe, srq, init, udata, uresp);
 	if (err)
-		goto err_put;
+		goto err_cleanup;
 
 	return 0;
 
-err_put:
-	rxe_put(srq);
+err_cleanup:
+	rxe_cleanup(srq);
+
 	return err;
 }
 
@@ -362,7 +367,7 @@ static int rxe_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata)
 {
 	struct rxe_srq *srq = to_rsrq(ibsrq);
 
-	rxe_put(srq);
+	rxe_cleanup(srq);
 	return 0;
 }
 
@@ -429,10 +434,11 @@ static int rxe_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *init,
 	if (err)
 		goto qp_init;
 
+	rxe_finalize(qp);
 	return 0;
 
 qp_init:
-	rxe_put(qp);
+	rxe_cleanup(qp);
 	return err;
 }
 
@@ -485,7 +491,7 @@ static int rxe_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
 	if (ret)
 		return ret;
 
-	rxe_put(qp);
+	rxe_cleanup(qp);
 	return 0;
 }
 
@@ -803,7 +809,7 @@ static int rxe_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
 
 	rxe_cq_disable(cq);
 
-	rxe_put(cq);
+	rxe_cleanup(cq);
 	return 0;
 }
 
@@ -898,6 +904,7 @@ static struct ib_mr *rxe_get_dma_mr(struct ib_pd *ibpd, int access)
 
 	rxe_get(pd);
 	rxe_mr_init_dma(pd, access, mr);
+	rxe_finalize(mr);
 
 	return &mr->ibmr;
 }
@@ -926,11 +933,13 @@ static struct ib_mr *rxe_reg_user_mr(struct ib_pd *ibpd,
 	if (err)
 		goto err3;
 
+	rxe_finalize(mr);
+
 	return &mr->ibmr;
 
 err3:
 	rxe_put(pd);
-	rxe_put(mr);
+	rxe_cleanup(mr);
 err2:
 	return ERR_PTR(err);
 }
@@ -958,35 +967,52 @@ static struct ib_mr *rxe_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type,
 	if (err)
 		goto err2;
 
+	rxe_finalize(mr);
+
 	return &mr->ibmr;
 
 err2:
 	rxe_put(pd);
-	rxe_put(mr);
+	rxe_cleanup(mr);
 err1:
 	return ERR_PTR(err);
 }
 
-/* build next_map_set from scatterlist
- * The IB_WR_REG_MR WR will swap map_sets
- */
+static int rxe_set_page(struct ib_mr *ibmr, u64 addr)
+{
+	struct rxe_mr *mr = to_rmr(ibmr);
+	struct rxe_map *map;
+	struct rxe_phys_buf *buf;
+
+	if (unlikely(mr->nbuf == mr->num_buf))
+		return -ENOMEM;
+
+	map = mr->map[mr->nbuf / RXE_BUF_PER_MAP];
+	buf = &map->buf[mr->nbuf % RXE_BUF_PER_MAP];
+
+	buf->addr = addr;
+	buf->size = ibmr->page_size;
+	mr->nbuf++;
+
+	return 0;
+}
+
 static int rxe_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg,
 			 int sg_nents, unsigned int *sg_offset)
 {
 	struct rxe_mr *mr = to_rmr(ibmr);
-	struct rxe_map_set *set = mr->next_map_set;
 	int n;
 
-	set->nbuf = 0;
+	mr->nbuf = 0;
 
-	n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, rxe_mr_set_page);
+	n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, rxe_set_page);
 
-	set->va = ibmr->iova;
-	set->iova = ibmr->iova;
-	set->length = ibmr->length;
-	set->page_shift = ilog2(ibmr->page_size);
-	set->page_mask = ibmr->page_size - 1;
-	set->offset = set->iova & set->page_mask;
+	mr->va = ibmr->iova;
+	mr->iova = ibmr->iova;
+	mr->length = ibmr->length;
+	mr->page_shift = ilog2(ibmr->page_size);
+	mr->page_mask = ibmr->page_size - 1;
+	mr->offset = mr->iova & mr->page_mask;
 
 	return n;
 }
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h
index ac464e68c923..96af3e054f4d 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.h
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.h
@@ -9,7 +9,6 @@
 
 #include <linux/interrupt.h>
 #include <linux/workqueue.h>
-#include <rdma/rdma_user_rxe.h>
 #include "rxe_pool.h"
 #include "rxe_task.h"
 #include "rxe_hw_counters.h"
@@ -124,11 +123,13 @@ struct rxe_req_info {
 	int			need_rd_atomic;
 	int			wait_psn;
 	int			need_retry;
+	int			wait_for_rnr_timer;
 	int			noack_pkts;
 	struct rxe_task		task;
 };
 
 struct rxe_comp_info {
+	enum rxe_qp_state	state;
 	u32			psn;
 	int			opcode;
 	int			timeout;
@@ -155,7 +156,7 @@ struct resp_res {
 
 	union {
 		struct {
-			struct sk_buff	*skb;
+			u64		orig_val;
 		} atomic;
 		struct {
 			u64		va_org;
@@ -189,7 +190,6 @@ struct rxe_resp_info {
 	u32			resid;
 	u32			rkey;
 	u32			length;
-	u64			atomic_orig;
 
 	/* SRQ only */
 	struct {
@@ -288,17 +288,6 @@ struct rxe_map {
 	struct rxe_phys_buf	buf[RXE_BUF_PER_MAP];
 };
 
-struct rxe_map_set {
-	struct rxe_map		**map;
-	u64			va;
-	u64			iova;
-	size_t			length;
-	u32			offset;
-	u32			nbuf;
-	int			page_shift;
-	int			page_mask;
-};
-
 static inline int rkey_is_mw(u32 rkey)
 {
 	u32 index = rkey >> 8;
@@ -316,20 +305,26 @@ struct rxe_mr {
 	u32			rkey;
 	enum rxe_mr_state	state;
 	enum ib_mr_type		type;
+	u64			va;
+	u64			iova;
+	size_t			length;
+	u32			offset;
 	int			access;
 
+	int			page_shift;
+	int			page_mask;
 	int			map_shift;
 	int			map_mask;
 
 	u32			num_buf;
+	u32			nbuf;
 
 	u32			max_buf;
 	u32			num_map;
 
 	atomic_t		num_mw;
 
-	struct rxe_map_set	*cur_map_set;
-	struct rxe_map_set	*next_map_set;
+	struct rxe_map		**map;
 };
 
 enum rxe_mw_state {
diff --git a/drivers/infiniband/sw/siw/siw_cm.c b/drivers/infiniband/sw/siw/siw_cm.c
index 17f34d584cd9..f88d2971c2c6 100644
--- a/drivers/infiniband/sw/siw/siw_cm.c
+++ b/drivers/infiniband/sw/siw/siw_cm.c
@@ -725,11 +725,11 @@ static int siw_proc_mpareply(struct siw_cep *cep)
 	enum mpa_v2_ctrl mpa_p2p_mode = MPA_V2_RDMA_NO_RTR;
 
 	rv = siw_recv_mpa_rr(cep);
-	if (rv != -EAGAIN)
-		siw_cancel_mpatimer(cep);
 	if (rv)
 		goto out_err;
 
+	siw_cancel_mpatimer(cep);
+
 	rep = &cep->mpa.hdr;
 
 	if (__mpa_rr_revision(rep->params.bits) > MPA_REVISION_2) {
@@ -895,7 +895,8 @@ static int siw_proc_mpareply(struct siw_cep *cep)
 	}
 
 out_err:
-	siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -EINVAL);
+	if (rv != -EAGAIN)
+		siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -EINVAL);
 
 	return rv;
 }
diff --git a/drivers/infiniband/sw/siw/siw_verbs.c b/drivers/infiniband/sw/siw/siw_verbs.c
index 09316072b789..8dedae7ae79e 100644
--- a/drivers/infiniband/sw/siw/siw_verbs.c
+++ b/drivers/infiniband/sw/siw/siw_verbs.c
@@ -1167,7 +1167,7 @@ int siw_create_cq(struct ib_cq *base_cq, const struct ib_cq_init_attr *attr,
 err_out:
 	siw_dbg(base_cq->device, "CQ creation failed: %d", rv);
 
-	if (cq && cq->queue) {
+	if (cq->queue) {
 		struct siw_ucontext *ctx =
 			rdma_udata_to_drv_context(udata, struct siw_ucontext,
 						  base_ucontext);
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index f7995519bbc8..ed25061fac62 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -1109,7 +1109,7 @@ static bool ipoib_dev_addr_changed_valid(struct ipoib_dev_priv *priv)
 	 * if he sets the device address back to be based on GID index 0,
 	 * he no longer wishs to control it.
 	 *
-	 * If the user doesn't control the the device address,
+	 * If the user doesn't control the device address,
 	 * IPOIB_FLAG_DEV_ADDR_SET is set and ib_find_gid failed it means
 	 * the port GUID has changed and GID at index 0 has changed
 	 * so we need to change priv->local_gid and priv->dev->dev_addr
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 2a8961b685c2..a4904371e2db 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -1664,8 +1664,10 @@ static void ipoib_napi_add(struct net_device *dev)
 {
 	struct ipoib_dev_priv *priv = ipoib_priv(dev);
 
-	netif_napi_add(dev, &priv->recv_napi, ipoib_rx_poll, IPOIB_NUM_WC);
-	netif_napi_add(dev, &priv->send_napi, ipoib_tx_poll, MAX_SEND_CQE);
+	netif_napi_add_weight(dev, &priv->recv_napi, ipoib_rx_poll,
+			      IPOIB_NUM_WC);
+	netif_napi_add_weight(dev, &priv->send_napi, ipoib_tx_poll,
+			      MAX_SEND_CQE);
 }
 
 static void ipoib_napi_del(struct net_device *dev)
diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c
index c08f2d9133b6..a00ca117303a 100644
--- a/drivers/infiniband/ulp/iser/iser_verbs.c
+++ b/drivers/infiniband/ulp/iser/iser_verbs.c
@@ -246,6 +246,7 @@ static int iser_create_ib_conn_res(struct ib_conn *ib_conn)
 	device = ib_conn->device;
 	ib_dev = device->ib_device;
 
+	/* +1 for drain */
 	if (ib_conn->pi_support)
 		max_send_wr = ISER_QP_SIG_MAX_REQ_DTOS + 1;
 	else
@@ -267,7 +268,8 @@ static int iser_create_ib_conn_res(struct ib_conn *ib_conn)
 	init_attr.qp_context = (void *)ib_conn;
 	init_attr.send_cq = ib_conn->cq;
 	init_attr.recv_cq = ib_conn->cq;
-	init_attr.cap.max_recv_wr = ISER_QP_MAX_RECV_DTOS;
+	/* +1 for drain */
+	init_attr.cap.max_recv_wr = ISER_QP_MAX_RECV_DTOS + 1;
 	init_attr.cap.max_send_sge = 2;
 	init_attr.cap.max_recv_sge = 1;
 	init_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
@@ -485,7 +487,7 @@ int iser_conn_terminate(struct iser_conn *iser_conn)
 				 iser_conn, err);
 
 		/* block until all flush errors are consumed */
-		ib_drain_sq(ib_conn->qp);
+		ib_drain_qp(ib_conn->qp);
 	}
 
 	return 1;
diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt-stats.c b/drivers/infiniband/ulp/rtrs/rtrs-clt-stats.c
index 385a19846c24..1e6ffafa2db3 100644
--- a/drivers/infiniband/ulp/rtrs/rtrs-clt-stats.c
+++ b/drivers/infiniband/ulp/rtrs/rtrs-clt-stats.c
@@ -32,11 +32,7 @@ void rtrs_clt_update_wc_stats(struct rtrs_clt_con *con)
 
 void rtrs_clt_inc_failover_cnt(struct rtrs_clt_stats *stats)
 {
-	struct rtrs_clt_stats_pcpu *s;
-
-	s = get_cpu_ptr(stats->pcpu_stats);
-	s->rdma.failover_cnt++;
-	put_cpu_ptr(stats->pcpu_stats);
+	this_cpu_inc(stats->pcpu_stats->rdma.failover_cnt);
 }
 
 int rtrs_clt_stats_migration_from_cnt_to_str(struct rtrs_clt_stats *stats, char *buf)
@@ -169,12 +165,8 @@ int rtrs_clt_reset_all_stats(struct rtrs_clt_stats *s, bool enable)
 static inline void rtrs_clt_update_rdma_stats(struct rtrs_clt_stats *stats,
 					       size_t size, int d)
 {
-	struct rtrs_clt_stats_pcpu *s;
-
-	s = get_cpu_ptr(stats->pcpu_stats);
-	s->rdma.dir[d].cnt++;
-	s->rdma.dir[d].size_total += size;
-	put_cpu_ptr(stats->pcpu_stats);
+	this_cpu_inc(stats->pcpu_stats->rdma.dir[d].cnt);
+	this_cpu_add(stats->pcpu_stats->rdma.dir[d].size_total, size);
 }
 
 void rtrs_clt_update_all_stats(struct rtrs_clt_io_req *req, int dir)
diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt.c b/drivers/infiniband/ulp/rtrs/rtrs-clt.c
index 9809c3883979..baecde41d126 100644
--- a/drivers/infiniband/ulp/rtrs/rtrs-clt.c
+++ b/drivers/infiniband/ulp/rtrs/rtrs-clt.c
@@ -740,25 +740,25 @@ struct path_it {
 	struct rtrs_clt_path *(*next_path)(struct path_it *it);
 };
 
-/**
- * list_next_or_null_rr_rcu - get next list element in round-robin fashion.
+/*
+ * rtrs_clt_get_next_path_or_null - get clt path from the list or return NULL
  * @head:	the head for the list.
- * @ptr:        the list head to take the next element from.
- * @type:       the type of the struct this is embedded in.
- * @memb:       the name of the list_head within the struct.
+ * @clt_path:	The element to take the next clt_path from.
  *
- * Next element returned in round-robin fashion, i.e. head will be skipped,
+ * Next clt path returned in round-robin fashion, i.e. head will be skipped,
  * but if list is observed as empty, NULL will be returned.
  *
- * This primitive may safely run concurrently with the _rcu list-mutation
+ * This function may safely run concurrently with the _rcu list-mutation
  * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock().
  */
-#define list_next_or_null_rr_rcu(head, ptr, type, memb) \
-({ \
-	list_next_or_null_rcu(head, ptr, type, memb) ?: \
-		list_next_or_null_rcu(head, READ_ONCE((ptr)->next), \
-				      type, memb); \
-})
+static inline struct rtrs_clt_path *
+rtrs_clt_get_next_path_or_null(struct list_head *head, struct rtrs_clt_path *clt_path)
+{
+	return list_next_or_null_rcu(head, &clt_path->s.entry, typeof(*clt_path), s.entry) ?:
+				     list_next_or_null_rcu(head,
+							   READ_ONCE((&clt_path->s.entry)->next),
+							   typeof(*clt_path), s.entry);
+}
 
 /**
  * get_next_path_rr() - Returns path in round-robin fashion.
@@ -789,10 +789,8 @@ static struct rtrs_clt_path *get_next_path_rr(struct path_it *it)
 		path = list_first_or_null_rcu(&clt->paths_list,
 					      typeof(*path), s.entry);
 	else
-		path = list_next_or_null_rr_rcu(&clt->paths_list,
-						&path->s.entry,
-						typeof(*path),
-						s.entry);
+		path = rtrs_clt_get_next_path_or_null(&clt->paths_list, path);
+
 	rcu_assign_pointer(*ppcpu_path, path);
 
 	return path;
@@ -1403,8 +1401,7 @@ static int alloc_permits(struct rtrs_clt_sess *clt)
 	unsigned int chunk_bits;
 	int err, i;
 
-	clt->permits_map = kcalloc(BITS_TO_LONGS(clt->queue_depth),
-				   sizeof(long), GFP_KERNEL);
+	clt->permits_map = bitmap_zalloc(clt->queue_depth, GFP_KERNEL);
 	if (!clt->permits_map) {
 		err = -ENOMEM;
 		goto out_err;
@@ -1426,7 +1423,7 @@ static int alloc_permits(struct rtrs_clt_sess *clt)
 	return 0;
 
 err_map:
-	kfree(clt->permits_map);
+	bitmap_free(clt->permits_map);
 	clt->permits_map = NULL;
 out_err:
 	return err;
@@ -1434,13 +1431,11 @@ out_err:
 
 static void free_permits(struct rtrs_clt_sess *clt)
 {
-	if (clt->permits_map) {
-		size_t sz = clt->queue_depth;
-
+	if (clt->permits_map)
 		wait_event(clt->permits_wait,
-			   find_first_bit(clt->permits_map, sz) >= sz);
-	}
-	kfree(clt->permits_map);
+			   bitmap_empty(clt->permits_map, clt->queue_depth));
+
+	bitmap_free(clt->permits_map);
 	clt->permits_map = NULL;
 	kfree(clt->permits);
 	clt->permits = NULL;
@@ -2277,8 +2272,7 @@ static void rtrs_clt_remove_path_from_arr(struct rtrs_clt_path *clt_path)
 	 * removed.  If @sess is the last element, then @next is NULL.
 	 */
 	rcu_read_lock();
-	next = list_next_or_null_rr_rcu(&clt->paths_list, &clt_path->s.entry,
-					typeof(*next), s.entry);
+	next = rtrs_clt_get_next_path_or_null(&clt->paths_list, clt_path);
 	rcu_read_unlock();
 
 	/*
diff --git a/drivers/infiniband/ulp/rtrs/rtrs-pri.h b/drivers/infiniband/ulp/rtrs/rtrs-pri.h
index 9a1e5c2ae55c..ac0df734eba8 100644
--- a/drivers/infiniband/ulp/rtrs/rtrs-pri.h
+++ b/drivers/infiniband/ulp/rtrs/rtrs-pri.h
@@ -23,6 +23,17 @@
 #define RTRS_PROTO_VER_STRING __stringify(RTRS_PROTO_VER_MAJOR) "." \
 			       __stringify(RTRS_PROTO_VER_MINOR)
 
+/*
+ * Max IB immediate data size is 2^28 (MAX_IMM_PAYL_BITS)
+ * and the minimum chunk size is 4096 (2^12).
+ * So the maximum sess_queue_depth is 65536 (2^16) in theory.
+ * But mempool_create, create_qp and ib_post_send fail with
+ * "cannot allocate memory" error if sess_queue_depth is too big.
+ * Therefore the pratical max value of sess_queue_depth is
+ * somewhere between 1 and 65534 and it depends on the system.
+ */
+#define MAX_SESS_QUEUE_DEPTH 65535
+
 enum rtrs_imm_const {
 	MAX_IMM_TYPE_BITS = 4,
 	MAX_IMM_TYPE_MASK = ((1 << MAX_IMM_TYPE_BITS) - 1),
@@ -46,16 +57,6 @@ enum {
 
 	MAX_PATHS_NUM = 128,
 
-	/*
-	 * Max IB immediate data size is 2^28 (MAX_IMM_PAYL_BITS)
-	 * and the minimum chunk size is 4096 (2^12).
-	 * So the maximum sess_queue_depth is 65536 (2^16) in theory.
-	 * But mempool_create, create_qp and ib_post_send fail with
-	 * "cannot allocate memory" error if sess_queue_depth is too big.
-	 * Therefore the pratical max value of sess_queue_depth is
-	 * somewhere between 1 and 65534 and it depends on the system.
-	 */
-	MAX_SESS_QUEUE_DEPTH = 65535,
 	MIN_CHUNK_SIZE = 8192,
 
 	RTRS_HB_INTERVAL_MS = 5000,
diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv-stats.c b/drivers/infiniband/ulp/rtrs/rtrs-srv-stats.c
index 44b1c1652131..2aff1213a19d 100644
--- a/drivers/infiniband/ulp/rtrs/rtrs-srv-stats.c
+++ b/drivers/infiniband/ulp/rtrs/rtrs-srv-stats.c
@@ -14,9 +14,14 @@
 int rtrs_srv_reset_rdma_stats(struct rtrs_srv_stats *stats, bool enable)
 {
 	if (enable) {
-		struct rtrs_srv_stats_rdma_stats *r = &stats->rdma_stats;
+		int cpu;
+		struct rtrs_srv_stats_rdma_stats *r;
+
+		for_each_possible_cpu(cpu) {
+			r = per_cpu_ptr(stats->rdma_stats, cpu);
+			memset(r, 0, sizeof(*r));
+		}
 
-		memset(r, 0, sizeof(*r));
 		return 0;
 	}
 
@@ -25,11 +30,22 @@ int rtrs_srv_reset_rdma_stats(struct rtrs_srv_stats *stats, bool enable)
 
 ssize_t rtrs_srv_stats_rdma_to_str(struct rtrs_srv_stats *stats, char *page)
 {
-	struct rtrs_srv_stats_rdma_stats *r = &stats->rdma_stats;
+	int cpu;
+	struct rtrs_srv_stats_rdma_stats sum;
+	struct rtrs_srv_stats_rdma_stats *r;
 
-	return sysfs_emit(page, "%lld %lld %lld %lldn %u\n",
-			  (s64)atomic64_read(&r->dir[READ].cnt),
-			  (s64)atomic64_read(&r->dir[READ].size_total),
-			  (s64)atomic64_read(&r->dir[WRITE].cnt),
-			  (s64)atomic64_read(&r->dir[WRITE].size_total), 0);
+	memset(&sum, 0, sizeof(sum));
+
+	for_each_possible_cpu(cpu) {
+		r = per_cpu_ptr(stats->rdma_stats, cpu);
+
+		sum.dir[READ].cnt	  += r->dir[READ].cnt;
+		sum.dir[READ].size_total  += r->dir[READ].size_total;
+		sum.dir[WRITE].cnt	  += r->dir[WRITE].cnt;
+		sum.dir[WRITE].size_total += r->dir[WRITE].size_total;
+	}
+
+	return sysfs_emit(page, "%llu %llu %llu %llu\n",
+			  sum.dir[READ].cnt, sum.dir[READ].size_total,
+			  sum.dir[WRITE].cnt, sum.dir[WRITE].size_total);
 }
diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c b/drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c
index b94ae12c2795..2a3c9ac64a42 100644
--- a/drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c
+++ b/drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c
@@ -220,6 +220,8 @@ static void rtrs_srv_path_stats_release(struct kobject *kobj)
 
 	stats = container_of(kobj, struct rtrs_srv_stats, kobj_stats);
 
+	free_percpu(stats->rdma_stats);
+
 	kfree(stats);
 }
 
diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv.c b/drivers/infiniband/ulp/rtrs/rtrs-srv.c
index 24024bce2566..34c03bde5064 100644
--- a/drivers/infiniband/ulp/rtrs/rtrs-srv.c
+++ b/drivers/infiniband/ulp/rtrs/rtrs-srv.c
@@ -11,7 +11,6 @@
 #define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt
 
 #include <linux/module.h>
-#include <linux/mempool.h>
 
 #include "rtrs-srv.h"
 #include "rtrs-log.h"
@@ -26,11 +25,7 @@ MODULE_LICENSE("GPL");
 #define DEFAULT_SESS_QUEUE_DEPTH 512
 #define MAX_HDR_SIZE PAGE_SIZE
 
-/* We guarantee to serve 10 paths at least */
-#define CHUNK_POOL_SZ 10
-
 static struct rtrs_rdma_dev_pd dev_pd;
-static mempool_t *chunk_pool;
 struct class *rtrs_dev_class;
 static struct rtrs_srv_ib_ctx ib_ctx;
 
@@ -1358,7 +1353,7 @@ static void free_srv(struct rtrs_srv_sess *srv)
 
 	WARN_ON(refcount_read(&srv->refcount));
 	for (i = 0; i < srv->queue_depth; i++)
-		mempool_free(srv->chunks[i], chunk_pool);
+		__free_pages(srv->chunks[i], get_order(max_chunk_size));
 	kfree(srv->chunks);
 	mutex_destroy(&srv->paths_mutex);
 	mutex_destroy(&srv->paths_ev_mutex);
@@ -1411,7 +1406,8 @@ static struct rtrs_srv_sess *get_or_create_srv(struct rtrs_srv_ctx *ctx,
 		goto err_free_srv;
 
 	for (i = 0; i < srv->queue_depth; i++) {
-		srv->chunks[i] = mempool_alloc(chunk_pool, GFP_KERNEL);
+		srv->chunks[i] = alloc_pages(GFP_KERNEL,
+					     get_order(max_chunk_size));
 		if (!srv->chunks[i])
 			goto err_free_chunks;
 	}
@@ -1424,7 +1420,7 @@ static struct rtrs_srv_sess *get_or_create_srv(struct rtrs_srv_ctx *ctx,
 
 err_free_chunks:
 	while (i--)
-		mempool_free(srv->chunks[i], chunk_pool);
+		__free_pages(srv->chunks[i], get_order(max_chunk_size));
 	kfree(srv->chunks);
 
 err_free_srv:
@@ -1513,6 +1509,7 @@ static void free_path(struct rtrs_srv_path *srv_path)
 		kobject_del(&srv_path->kobj);
 		kobject_put(&srv_path->kobj);
 	} else {
+		free_percpu(srv_path->stats->rdma_stats);
 		kfree(srv_path->stats);
 		kfree(srv_path);
 	}
@@ -1755,13 +1752,17 @@ static struct rtrs_srv_path *__alloc_path(struct rtrs_srv_sess *srv,
 	if (!srv_path->stats)
 		goto err_free_sess;
 
+	srv_path->stats->rdma_stats = alloc_percpu(struct rtrs_srv_stats_rdma_stats);
+	if (!srv_path->stats->rdma_stats)
+		goto err_free_stats;
+
 	srv_path->stats->srv_path = srv_path;
 
 	srv_path->dma_addr = kcalloc(srv->queue_depth,
 				     sizeof(*srv_path->dma_addr),
 				     GFP_KERNEL);
 	if (!srv_path->dma_addr)
-		goto err_free_stats;
+		goto err_free_percpu;
 
 	srv_path->s.con = kcalloc(con_num, sizeof(*srv_path->s.con),
 				  GFP_KERNEL);
@@ -1813,6 +1814,8 @@ err_free_con:
 	kfree(srv_path->s.con);
 err_free_dma_addr:
 	kfree(srv_path->dma_addr);
+err_free_percpu:
+	free_percpu(srv_path->stats->rdma_stats);
 err_free_stats:
 	kfree(srv_path->stats);
 err_free_sess:
@@ -2266,14 +2269,10 @@ static int __init rtrs_server_init(void)
 		       err);
 		return err;
 	}
-	chunk_pool = mempool_create_page_pool(sess_queue_depth * CHUNK_POOL_SZ,
-					      get_order(max_chunk_size));
-	if (!chunk_pool)
-		return -ENOMEM;
 	rtrs_dev_class = class_create(THIS_MODULE, "rtrs-server");
 	if (IS_ERR(rtrs_dev_class)) {
 		err = PTR_ERR(rtrs_dev_class);
-		goto out_chunk_pool;
+		goto out_err;
 	}
 	rtrs_wq = alloc_workqueue("rtrs_server_wq", 0, 0);
 	if (!rtrs_wq) {
@@ -2285,9 +2284,7 @@ static int __init rtrs_server_init(void)
 
 out_dev_class:
 	class_destroy(rtrs_dev_class);
-out_chunk_pool:
-	mempool_destroy(chunk_pool);
-
+out_err:
 	return err;
 }
 
@@ -2295,7 +2292,6 @@ static void __exit rtrs_server_exit(void)
 {
 	destroy_workqueue(rtrs_wq);
 	class_destroy(rtrs_dev_class);
-	mempool_destroy(chunk_pool);
 	rtrs_rdma_dev_pd_deinit(&dev_pd);
 }
 
diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv.h b/drivers/infiniband/ulp/rtrs/rtrs-srv.h
index 6292e87f6afd..186a63c217df 100644
--- a/drivers/infiniband/ulp/rtrs/rtrs-srv.h
+++ b/drivers/infiniband/ulp/rtrs/rtrs-srv.h
@@ -12,6 +12,7 @@
 
 #include <linux/device.h>
 #include <linux/refcount.h>
+#include <linux/percpu.h>
 #include "rtrs-pri.h"
 
 /*
@@ -29,15 +30,15 @@ enum rtrs_srv_state {
  */
 struct rtrs_srv_stats_rdma_stats {
 	struct {
-		atomic64_t	cnt;
-		atomic64_t	size_total;
+		u64 cnt;
+		u64 size_total;
 	} dir[2];
 };
 
 struct rtrs_srv_stats {
-	struct kobject				kobj_stats;
-	struct rtrs_srv_stats_rdma_stats	rdma_stats;
-	struct rtrs_srv_path			*srv_path;
+	struct kobject					kobj_stats;
+	struct rtrs_srv_stats_rdma_stats __percpu	*rdma_stats;
+	struct rtrs_srv_path				*srv_path;
 };
 
 struct rtrs_srv_con {
@@ -130,8 +131,8 @@ void close_path(struct rtrs_srv_path *srv_path);
 static inline void rtrs_srv_update_rdma_stats(struct rtrs_srv_stats *s,
 					      size_t size, int d)
 {
-	atomic64_inc(&s->rdma_stats.dir[d].cnt);
-	atomic64_add(size, &s->rdma_stats.dir[d].size_total);
+	this_cpu_inc(s->rdma_stats->dir[d].cnt);
+	this_cpu_add(s->rdma_stats->dir[d].size_total, size);
 }
 
 /* functions which are implemented in rtrs-srv-stats.c */
diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c
index f86ee1c4b970..21cbe30d526f 100644
--- a/drivers/infiniband/ulp/srpt/ib_srpt.c
+++ b/drivers/infiniband/ulp/srpt/ib_srpt.c
@@ -565,12 +565,9 @@ static int srpt_refresh_port(struct srpt_port *sport)
 	if (ret)
 		return ret;
 
-	sport->port_guid_id.wwn.priv = sport;
-	srpt_format_guid(sport->port_guid_id.name,
-			 sizeof(sport->port_guid_id.name),
+	srpt_format_guid(sport->guid_name, ARRAY_SIZE(sport->guid_name),
 			 &sport->gid.global.interface_id);
-	sport->port_gid_id.wwn.priv = sport;
-	snprintf(sport->port_gid_id.name, sizeof(sport->port_gid_id.name),
+	snprintf(sport->gid_name, ARRAY_SIZE(sport->gid_name),
 		 "0x%016llx%016llx",
 		 be64_to_cpu(sport->gid.global.subnet_prefix),
 		 be64_to_cpu(sport->gid.global.interface_id));
@@ -2221,13 +2218,13 @@ static int srpt_cm_req_recv(struct srpt_device *const sdev,
 	ch->zw_cqe.done = srpt_zerolength_write_done;
 	INIT_WORK(&ch->release_work, srpt_release_channel_work);
 	ch->sport = sport;
-	if (ib_cm_id) {
-		ch->ib_cm.cm_id = ib_cm_id;
-		ib_cm_id->context = ch;
-	} else {
+	if (rdma_cm_id) {
 		ch->using_rdma_cm = true;
 		ch->rdma_cm.cm_id = rdma_cm_id;
 		rdma_cm_id->context = ch;
+	} else {
+		ch->ib_cm.cm_id = ib_cm_id;
+		ib_cm_id->context = ch;
 	}
 	/*
 	 * ch->rq_size should be at least as large as the initiator queue
@@ -2314,31 +2311,35 @@ static int srpt_cm_req_recv(struct srpt_device *const sdev,
 	tag_num = ch->rq_size;
 	tag_size = 1; /* ib_srpt does not use se_sess->sess_cmd_map */
 
-	mutex_lock(&sport->port_guid_id.mutex);
-	list_for_each_entry(stpg, &sport->port_guid_id.tpg_list, entry) {
-		if (!IS_ERR_OR_NULL(ch->sess))
-			break;
-		ch->sess = target_setup_session(&stpg->tpg, tag_num,
+	if (sport->guid_id) {
+		mutex_lock(&sport->guid_id->mutex);
+		list_for_each_entry(stpg, &sport->guid_id->tpg_list, entry) {
+			if (!IS_ERR_OR_NULL(ch->sess))
+				break;
+			ch->sess = target_setup_session(&stpg->tpg, tag_num,
 						tag_size, TARGET_PROT_NORMAL,
 						ch->sess_name, ch, NULL);
+		}
+		mutex_unlock(&sport->guid_id->mutex);
 	}
-	mutex_unlock(&sport->port_guid_id.mutex);
 
-	mutex_lock(&sport->port_gid_id.mutex);
-	list_for_each_entry(stpg, &sport->port_gid_id.tpg_list, entry) {
-		if (!IS_ERR_OR_NULL(ch->sess))
-			break;
-		ch->sess = target_setup_session(&stpg->tpg, tag_num,
+	if (sport->gid_id) {
+		mutex_lock(&sport->gid_id->mutex);
+		list_for_each_entry(stpg, &sport->gid_id->tpg_list, entry) {
+			if (!IS_ERR_OR_NULL(ch->sess))
+				break;
+			ch->sess = target_setup_session(&stpg->tpg, tag_num,
 					tag_size, TARGET_PROT_NORMAL, i_port_id,
 					ch, NULL);
-		if (!IS_ERR_OR_NULL(ch->sess))
-			break;
-		/* Retry without leading "0x" */
-		ch->sess = target_setup_session(&stpg->tpg, tag_num,
+			if (!IS_ERR_OR_NULL(ch->sess))
+				break;
+			/* Retry without leading "0x" */
+			ch->sess = target_setup_session(&stpg->tpg, tag_num,
 						tag_size, TARGET_PROT_NORMAL,
 						i_port_id + 2, ch, NULL);
+		}
+		mutex_unlock(&sport->gid_id->mutex);
 	}
-	mutex_unlock(&sport->port_gid_id.mutex);
 
 	if (IS_ERR_OR_NULL(ch->sess)) {
 		WARN_ON_ONCE(ch->sess == NULL);
@@ -2983,7 +2984,12 @@ static int srpt_release_sport(struct srpt_port *sport)
 	return 0;
 }
 
-static struct se_wwn *__srpt_lookup_wwn(const char *name)
+struct port_and_port_id {
+	struct srpt_port *sport;
+	struct srpt_port_id **port_id;
+};
+
+static struct port_and_port_id __srpt_lookup_port(const char *name)
 {
 	struct ib_device *dev;
 	struct srpt_device *sdev;
@@ -2998,25 +3004,38 @@ static struct se_wwn *__srpt_lookup_wwn(const char *name)
 		for (i = 0; i < dev->phys_port_cnt; i++) {
 			sport = &sdev->port[i];
 
-			if (strcmp(sport->port_guid_id.name, name) == 0)
-				return &sport->port_guid_id.wwn;
-			if (strcmp(sport->port_gid_id.name, name) == 0)
-				return &sport->port_gid_id.wwn;
+			if (strcmp(sport->guid_name, name) == 0) {
+				kref_get(&sdev->refcnt);
+				return (struct port_and_port_id){
+					sport, &sport->guid_id};
+			}
+			if (strcmp(sport->gid_name, name) == 0) {
+				kref_get(&sdev->refcnt);
+				return (struct port_and_port_id){
+					sport, &sport->gid_id};
+			}
 		}
 	}
 
-	return NULL;
+	return (struct port_and_port_id){};
 }
 
-static struct se_wwn *srpt_lookup_wwn(const char *name)
+/**
+ * srpt_lookup_port() - Look up an RDMA port by name
+ * @name: ASCII port name
+ *
+ * Increments the RDMA port reference count if an RDMA port pointer is returned.
+ * The caller must drop that reference count by calling srpt_port_put_ref().
+ */
+static struct port_and_port_id srpt_lookup_port(const char *name)
 {
-	struct se_wwn *wwn;
+	struct port_and_port_id papi;
 
 	spin_lock(&srpt_dev_lock);
-	wwn = __srpt_lookup_wwn(name);
+	papi = __srpt_lookup_port(name);
 	spin_unlock(&srpt_dev_lock);
 
-	return wwn;
+	return papi;
 }
 
 static void srpt_free_srq(struct srpt_device *sdev)
@@ -3101,6 +3120,18 @@ static int srpt_use_srq(struct srpt_device *sdev, bool use_srq)
 	return ret;
 }
 
+static void srpt_free_sdev(struct kref *refcnt)
+{
+	struct srpt_device *sdev = container_of(refcnt, typeof(*sdev), refcnt);
+
+	kfree(sdev);
+}
+
+static void srpt_sdev_put(struct srpt_device *sdev)
+{
+	kref_put(&sdev->refcnt, srpt_free_sdev);
+}
+
 /**
  * srpt_add_one - InfiniBand device addition callback function
  * @device: Describes a HCA.
@@ -3119,6 +3150,7 @@ static int srpt_add_one(struct ib_device *device)
 	if (!sdev)
 		return -ENOMEM;
 
+	kref_init(&sdev->refcnt);
 	sdev->device = device;
 	mutex_init(&sdev->sdev_mutex);
 
@@ -3182,10 +3214,6 @@ static int srpt_add_one(struct ib_device *device)
 		sport->port_attrib.srp_sq_size = DEF_SRPT_SQ_SIZE;
 		sport->port_attrib.use_srq = false;
 		INIT_WORK(&sport->work, srpt_refresh_port_work);
-		mutex_init(&sport->port_guid_id.mutex);
-		INIT_LIST_HEAD(&sport->port_guid_id.tpg_list);
-		mutex_init(&sport->port_gid_id.mutex);
-		INIT_LIST_HEAD(&sport->port_gid_id.tpg_list);
 
 		ret = srpt_refresh_port(sport);
 		if (ret) {
@@ -3214,7 +3242,7 @@ err_ring:
 	srpt_free_srq(sdev);
 	ib_dealloc_pd(sdev->pd);
 free_dev:
-	kfree(sdev);
+	srpt_sdev_put(sdev);
 	pr_info("%s(%s) failed.\n", __func__, dev_name(&device->dev));
 	return ret;
 }
@@ -3258,7 +3286,7 @@ static void srpt_remove_one(struct ib_device *device, void *client_data)
 
 	ib_dealloc_pd(sdev->pd);
 
-	kfree(sdev);
+	srpt_sdev_put(sdev);
 }
 
 static struct ib_client srpt_client = {
@@ -3286,10 +3314,10 @@ static struct srpt_port_id *srpt_wwn_to_sport_id(struct se_wwn *wwn)
 {
 	struct srpt_port *sport = wwn->priv;
 
-	if (wwn == &sport->port_guid_id.wwn)
-		return &sport->port_guid_id;
-	if (wwn == &sport->port_gid_id.wwn)
-		return &sport->port_gid_id;
+	if (sport->guid_id && &sport->guid_id->wwn == wwn)
+		return sport->guid_id;
+	if (sport->gid_id && &sport->gid_id->wwn == wwn)
+		return sport->gid_id;
 	WARN_ON_ONCE(true);
 	return NULL;
 }
@@ -3774,7 +3802,31 @@ static struct se_wwn *srpt_make_tport(struct target_fabric_configfs *tf,
 				      struct config_group *group,
 				      const char *name)
 {
-	return srpt_lookup_wwn(name) ? : ERR_PTR(-EINVAL);
+	struct port_and_port_id papi = srpt_lookup_port(name);
+	struct srpt_port *sport = papi.sport;
+	struct srpt_port_id *port_id;
+
+	if (!papi.port_id)
+		return ERR_PTR(-EINVAL);
+	if (*papi.port_id) {
+		/* Attempt to create a directory that already exists. */
+		WARN_ON_ONCE(true);
+		return &(*papi.port_id)->wwn;
+	}
+	port_id = kzalloc(sizeof(*port_id), GFP_KERNEL);
+	if (!port_id) {
+		srpt_sdev_put(sport->sdev);
+		return ERR_PTR(-ENOMEM);
+	}
+	mutex_init(&port_id->mutex);
+	INIT_LIST_HEAD(&port_id->tpg_list);
+	port_id->wwn.priv = sport;
+	memcpy(port_id->name, port_id == sport->guid_id ? sport->guid_name :
+	       sport->gid_name, ARRAY_SIZE(port_id->name));
+
+	*papi.port_id = port_id;
+
+	return &port_id->wwn;
 }
 
 /**
@@ -3783,6 +3835,18 @@ static struct se_wwn *srpt_make_tport(struct target_fabric_configfs *tf,
  */
 static void srpt_drop_tport(struct se_wwn *wwn)
 {
+	struct srpt_port_id *port_id = container_of(wwn, typeof(*port_id), wwn);
+	struct srpt_port *sport = wwn->priv;
+
+	if (sport->guid_id == port_id)
+		sport->guid_id = NULL;
+	else if (sport->gid_id == port_id)
+		sport->gid_id = NULL;
+	else
+		WARN_ON_ONCE(true);
+
+	srpt_sdev_put(sport->sdev);
+	kfree(port_id);
 }
 
 static ssize_t srpt_wwn_version_show(struct config_item *item, char *buf)
diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.h b/drivers/infiniband/ulp/srpt/ib_srpt.h
index 76e66f630c17..4c46b301eea1 100644
--- a/drivers/infiniband/ulp/srpt/ib_srpt.h
+++ b/drivers/infiniband/ulp/srpt/ib_srpt.h
@@ -376,7 +376,7 @@ struct srpt_tpg {
 };
 
 /**
- * struct srpt_port_id - information about an RDMA port name
+ * struct srpt_port_id - LIO RDMA port information
  * @mutex:	Protects @tpg_list changes.
  * @tpg_list:	TPGs associated with the RDMA port name.
  * @wwn:	WWN associated with the RDMA port name.
@@ -393,7 +393,7 @@ struct srpt_port_id {
 };
 
 /**
- * struct srpt_port - information associated by SRPT with a single IB port
+ * struct srpt_port - SRPT RDMA port information
  * @sdev:      backpointer to the HCA information.
  * @mad_agent: per-port management datagram processing information.
  * @enabled:   Whether or not this target port is enabled.
@@ -402,8 +402,10 @@ struct srpt_port_id {
  * @lid:       cached value of the port's lid.
  * @gid:       cached value of the port's gid.
  * @work:      work structure for refreshing the aforementioned cached values.
- * @port_guid_id: target port GUID
- * @port_gid_id: target port GID
+ * @guid_name: port name in GUID format.
+ * @guid_id:   LIO target port information for the port name in GUID format.
+ * @gid_name:  port name in GID format.
+ * @gid_id:    LIO target port information for the port name in GID format.
  * @port_attrib:   Port attributes that can be accessed through configfs.
  * @refcount:	   Number of objects associated with this port.
  * @freed_channels: Completion that will be signaled once @refcount becomes 0.
@@ -419,8 +421,10 @@ struct srpt_port {
 	u32			lid;
 	union ib_gid		gid;
 	struct work_struct	work;
-	struct srpt_port_id	port_guid_id;
-	struct srpt_port_id	port_gid_id;
+	char			guid_name[64];
+	struct srpt_port_id	*guid_id;
+	char			gid_name[64];
+	struct srpt_port_id	*gid_id;
 	struct srpt_port_attrib port_attrib;
 	atomic_t		refcount;
 	struct completion	*freed_channels;
@@ -430,6 +434,7 @@ struct srpt_port {
 
 /**
  * struct srpt_device - information associated by SRPT with a single HCA
+ * @refcnt:	   Reference count for this device.
  * @device:        Backpointer to the struct ib_device managed by the IB core.
  * @pd:            IB protection domain.
  * @lkey:          L_Key (local key) with write access to all local memory.
@@ -445,6 +450,7 @@ struct srpt_port {
  * @port:          Information about the ports owned by this HCA.
  */
 struct srpt_device {
+	struct kref		refcnt;
 	struct ib_device	*device;
 	struct ib_pd		*pd;
 	u32			lkey;
diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig
index cf3e8096942a..529c9d04e9a4 100644
--- a/drivers/md/bcache/Kconfig
+++ b/drivers/md/bcache/Kconfig
@@ -29,7 +29,7 @@ config BCACHE_CLOSURES_DEBUG
 	operations that get stuck.
 
 config BCACHE_ASYNC_REGISTRATION
-	bool "Asynchronous device registration (EXPERIMENTAL)"
+	bool "Asynchronous device registration"
 	depends on BCACHE
 	help
 	Add a sysfs file /sys/fs/bcache/register_async. Writing registering
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 1ec17c32867f..c640be453313 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -3728,6 +3728,7 @@ static int raid_message(struct dm_target *ti, unsigned int argc, char **argv,
 	if (!strcasecmp(argv[0], "idle") || !strcasecmp(argv[0], "frozen")) {
 		if (mddev->sync_thread) {
 			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+			md_unregister_thread(&mddev->sync_thread);
 			md_reap_sync_thread(mddev);
 		}
 	} else if (decipher_sync_action(mddev, mddev->recovery) != st_idle)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 99642f69bfa7..28bd4a35b86b 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1016,7 +1016,7 @@ static void dm_wq_requeue_work(struct work_struct *work)
 	while (io) {
 		struct dm_io *next = io->next;
 
-		dm_io_rewind(io, &md->queue->bio_split);
+		dm_io_rewind(io, &md->disk->bio_split);
 
 		io->next = NULL;
 		__dm_io_complete(io, false);
@@ -1181,7 +1181,7 @@ static sector_t max_io_len(struct dm_target *ti, sector_t sector)
 	 * Does the target need to split IO even further?
 	 * - varied (per target) IO splitting is a tenet of DM; this
 	 *   explains why stacked chunk_sectors based splitting via
-	 *   blk_queue_split() isn't possible here.
+	 *   bio_split_to_limits() isn't possible here.
 	 */
 	if (!ti->max_io_len)
 		return len;
@@ -1751,10 +1751,10 @@ static void dm_split_and_process_bio(struct mapped_device *md,
 	is_abnormal = is_abnormal_io(bio);
 	if (unlikely(is_abnormal)) {
 		/*
-		 * Use blk_queue_split() for abnormal IO (e.g. discard, etc)
+		 * Use bio_split_to_limits() for abnormal IO (e.g. discard, etc)
 		 * otherwise associated queue_limits won't be imposed.
 		 */
-		blk_queue_split(&bio);
+		bio = bio_split_to_limits(bio);
 	}
 
 	init_clone_info(&ci, md, map, bio, is_abnormal);
diff --git a/drivers/md/md-autodetect.c b/drivers/md/md-autodetect.c
index 2cf973722f59..91836e6de326 100644
--- a/drivers/md/md-autodetect.c
+++ b/drivers/md/md-autodetect.c
@@ -125,7 +125,6 @@ static void __init md_setup_drive(struct md_setup_args *args)
 	char *devname = args->device_names;
 	dev_t devices[MD_SB_DISKS + 1], mdev;
 	struct mdu_array_info_s ainfo = { };
-	struct block_device *bdev;
 	struct mddev *mddev;
 	int err = 0, i;
 	char name[16];
@@ -169,24 +168,16 @@ static void __init md_setup_drive(struct md_setup_args *args)
 
 	pr_info("md: Loading %s: %s\n", name, args->device_names);
 
-	bdev = blkdev_get_by_dev(mdev, FMODE_READ, NULL);
-	if (IS_ERR(bdev)) {
-		pr_err("md: open failed - cannot start array %s\n", name);
+	mddev = md_alloc(mdev, name);
+	if (IS_ERR(mddev)) {
+		pr_err("md: md_alloc failed - cannot start array %s\n", name);
 		return;
 	}
 
-	err = -EIO;
-	if (WARN(bdev->bd_disk->fops != &md_fops,
-			"Opening block device %x resulted in non-md device\n",
-			mdev))
-		goto out_blkdev_put;
-
-	mddev = bdev->bd_disk->private_data;
-
 	err = mddev_lock(mddev);
 	if (err) {
 		pr_err("md: failed to lock array %s\n", name);
-		goto out_blkdev_put;
+		goto out_mddev_put;
 	}
 
 	if (!list_empty(&mddev->disks) || mddev->raid_disks) {
@@ -230,8 +221,8 @@ static void __init md_setup_drive(struct md_setup_args *args)
 		pr_warn("md: starting %s failed\n", name);
 out_unlock:
 	mddev_unlock(mddev);
-out_blkdev_put:
-	blkdev_put(bdev, FMODE_READ);
+out_mddev_put:
+	mddev_put(mddev);
 }
 
 static int __init raid_setup(char *str)
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 37cbcce3cc66..742b2349fea3 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -40,7 +40,7 @@ struct resync_info {
 
 /* Lock the send communication. This is done through
  * bit manipulation as opposed to a mutex in order to
- * accomodate lock and hold. See next comment.
+ * accommodate lock and hold. See next comment.
  */
 #define		MD_CLUSTER_SEND_LOCK			4
 /* If cluster operations (such as adding a disk) must lock the
@@ -689,7 +689,7 @@ static int lock_comm(struct md_cluster_info *cinfo, bool mddev_locked)
 	/*
 	 * If resync thread run after raid1d thread, then process_metadata_update
 	 * could not continue if raid1d held reconfig_mutex (and raid1d is blocked
-	 * since another node already got EX on Token and waitting the EX of Ack),
+	 * since another node already got EX on Token and waiting the EX of Ack),
 	 * so let resync wake up thread in case flag is set.
 	 */
 	if (mddev_locked && !test_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD,
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 4df78e30b76a..afaf36b2f6ab 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -368,28 +368,6 @@ EXPORT_SYMBOL_GPL(md_new_event);
 static LIST_HEAD(all_mddevs);
 static DEFINE_SPINLOCK(all_mddevs_lock);
 
-/*
- * iterates through all used mddevs in the system.
- * We take care to grab the all_mddevs_lock whenever navigating
- * the list, and to always hold a refcount when unlocked.
- * Any code which breaks out of this loop while own
- * a reference to the current mddev and must mddev_put it.
- */
-#define for_each_mddev(_mddev,_tmp)					\
-									\
-	for (({ spin_lock(&all_mddevs_lock);				\
-		_tmp = all_mddevs.next;					\
-		_mddev = NULL;});					\
-	     ({ if (_tmp != &all_mddevs)				\
-			mddev_get(list_entry(_tmp, struct mddev, all_mddevs));\
-		spin_unlock(&all_mddevs_lock);				\
-		if (_mddev) mddev_put(_mddev);				\
-		_mddev = list_entry(_tmp, struct mddev, all_mddevs);	\
-		_tmp != &all_mddevs;});					\
-	     ({ spin_lock(&all_mddevs_lock);				\
-		_tmp = _tmp->next;})					\
-		)
-
 /* Rather than calling directly into the personality make_request function,
  * IO requests come here first so that we can check if the device is
  * being suspended pending a reconfiguration.
@@ -464,7 +442,7 @@ static void md_submit_bio(struct bio *bio)
 		return;
 	}
 
-	blk_queue_split(&bio);
+	bio = bio_split_to_limits(bio);
 
 	if (mddev->ro == 1 && unlikely(rw == WRITE)) {
 		if (bio_sectors(bio) != 0)
@@ -647,13 +625,17 @@ EXPORT_SYMBOL(md_flush_request);
 
 static inline struct mddev *mddev_get(struct mddev *mddev)
 {
+	lockdep_assert_held(&all_mddevs_lock);
+
+	if (test_bit(MD_DELETED, &mddev->flags))
+		return NULL;
 	atomic_inc(&mddev->active);
 	return mddev;
 }
 
 static void mddev_delayed_delete(struct work_struct *ws);
 
-static void mddev_put(struct mddev *mddev)
+void mddev_put(struct mddev *mddev)
 {
 	if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
 		return;
@@ -661,7 +643,7 @@ static void mddev_put(struct mddev *mddev)
 	    mddev->ctime == 0 && !mddev->hold_active) {
 		/* Array is not configured at all, and not held active,
 		 * so destroy it */
-		list_del_init(&mddev->all_mddevs);
+		set_bit(MD_DELETED, &mddev->flags);
 
 		/*
 		 * Call queue_work inside the spinlock so that
@@ -678,7 +660,6 @@ static void md_safemode_timeout(struct timer_list *t);
 
 void mddev_init(struct mddev *mddev)
 {
-	kobject_init(&mddev->kobj, &md_ktype);
 	mutex_init(&mddev->open_mutex);
 	mutex_init(&mddev->reconfig_mutex);
 	mutex_init(&mddev->bitmap_info.mutex);
@@ -733,22 +714,6 @@ static dev_t mddev_alloc_unit(void)
 	return dev;
 }
 
-static struct mddev *mddev_find(dev_t unit)
-{
-	struct mddev *mddev;
-
-	if (MAJOR(unit) != MD_MAJOR)
-		unit &= ~((1 << MdpMinorShift) - 1);
-
-	spin_lock(&all_mddevs_lock);
-	mddev = mddev_find_locked(unit);
-	if (mddev)
-		mddev_get(mddev);
-	spin_unlock(&all_mddevs_lock);
-
-	return mddev;
-}
-
 static struct mddev *mddev_alloc(dev_t unit)
 {
 	struct mddev *new;
@@ -791,6 +756,15 @@ out_free_new:
 	return ERR_PTR(error);
 }
 
+static void mddev_free(struct mddev *mddev)
+{
+	spin_lock(&all_mddevs_lock);
+	list_del(&mddev->all_mddevs);
+	spin_unlock(&all_mddevs_lock);
+
+	kfree(mddev);
+}
+
 static const struct attribute_group md_redundancy_group;
 
 void mddev_unlock(struct mddev *mddev)
@@ -3335,14 +3309,35 @@ rdev_size_show(struct md_rdev *rdev, char *page)
 	return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2);
 }
 
-static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
+static int md_rdevs_overlap(struct md_rdev *a, struct md_rdev *b)
 {
 	/* check if two start/length pairs overlap */
-	if (s1+l1 <= s2)
-		return 0;
-	if (s2+l2 <= s1)
-		return 0;
-	return 1;
+	if (a->data_offset + a->sectors <= b->data_offset)
+		return false;
+	if (b->data_offset + b->sectors <= a->data_offset)
+		return false;
+	return true;
+}
+
+static bool md_rdev_overlaps(struct md_rdev *rdev)
+{
+	struct mddev *mddev;
+	struct md_rdev *rdev2;
+
+	spin_lock(&all_mddevs_lock);
+	list_for_each_entry(mddev, &all_mddevs, all_mddevs) {
+		if (test_bit(MD_DELETED, &mddev->flags))
+			continue;
+		rdev_for_each(rdev2, mddev) {
+			if (rdev != rdev2 && rdev->bdev == rdev2->bdev &&
+			    md_rdevs_overlap(rdev, rdev2)) {
+				spin_unlock(&all_mddevs_lock);
+				return true;
+			}
+		}
+	}
+	spin_unlock(&all_mddevs_lock);
+	return false;
 }
 
 static int strict_blocks_to_sectors(const char *buf, sector_t *sectors)
@@ -3394,46 +3389,21 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
 		return -EINVAL; /* component must fit device */
 
 	rdev->sectors = sectors;
-	if (sectors > oldsectors && my_mddev->external) {
-		/* Need to check that all other rdevs with the same
-		 * ->bdev do not overlap.  'rcu' is sufficient to walk
-		 * the rdev lists safely.
-		 * This check does not provide a hard guarantee, it
-		 * just helps avoid dangerous mistakes.
+
+	/*
+	 * Check that all other rdevs with the same bdev do not overlap.  This
+	 * check does not provide a hard guarantee, it just helps avoid
+	 * dangerous mistakes.
+	 */
+	if (sectors > oldsectors && my_mddev->external &&
+	    md_rdev_overlaps(rdev)) {
+		/*
+		 * Someone else could have slipped in a size change here, but
+		 * doing so is just silly.  We put oldsectors back because we
+		 * know it is safe, and trust userspace not to race with itself.
 		 */
-		struct mddev *mddev;
-		int overlap = 0;
-		struct list_head *tmp;
-
-		rcu_read_lock();
-		for_each_mddev(mddev, tmp) {
-			struct md_rdev *rdev2;
-
-			rdev_for_each(rdev2, mddev)
-				if (rdev->bdev == rdev2->bdev &&
-				    rdev != rdev2 &&
-				    overlaps(rdev->data_offset, rdev->sectors,
-					     rdev2->data_offset,
-					     rdev2->sectors)) {
-					overlap = 1;
-					break;
-				}
-			if (overlap) {
-				mddev_put(mddev);
-				break;
-			}
-		}
-		rcu_read_unlock();
-		if (overlap) {
-			/* Someone else could have slipped in a size
-			 * change here, but doing so is just silly.
-			 * We put oldsectors back because we *know* it is
-			 * safe, and trust userspace not to race with
-			 * itself
-			 */
-			rdev->sectors = oldsectors;
-			return -EBUSY;
-		}
+		rdev->sectors = oldsectors;
+		return -EBUSY;
 	}
 	return len;
 }
@@ -4830,6 +4800,19 @@ action_store(struct mddev *mddev, const char *page, size_t len)
 			if (work_pending(&mddev->del_work))
 				flush_workqueue(md_misc_wq);
 			if (mddev->sync_thread) {
+				sector_t save_rp = mddev->reshape_position;
+
+				mddev_unlock(mddev);
+				set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+				md_unregister_thread(&mddev->sync_thread);
+				mddev_lock_nointr(mddev);
+				/*
+				 * set RECOVERY_INTR again and restore reshape
+				 * position in case others changed them after
+				 * got lock, eg, reshape_position_store and
+				 * md_check_recovery.
+				 */
+				mddev->reshape_position = save_rp;
 				set_bit(MD_RECOVERY_INTR, &mddev->recovery);
 				md_reap_sync_thread(mddev);
 			}
@@ -5001,7 +4984,7 @@ static ssize_t
 sync_speed_show(struct mddev *mddev, char *page)
 {
 	unsigned long resync, dt, db;
-	if (mddev->curr_resync == 0)
+	if (mddev->curr_resync == MD_RESYNC_NONE)
 		return sprintf(page, "none\n");
 	resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active);
 	dt = (jiffies - mddev->resync_mark) / HZ;
@@ -5020,8 +5003,8 @@ sync_completed_show(struct mddev *mddev, char *page)
 	if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
 		return sprintf(page, "none\n");
 
-	if (mddev->curr_resync == 1 ||
-	    mddev->curr_resync == 2)
+	if (mddev->curr_resync == MD_RESYNC_YIELDED ||
+	    mddev->curr_resync == MD_RESYNC_DELAYED)
 		return sprintf(page, "delayed\n");
 
 	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
@@ -5532,11 +5515,10 @@ md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
 	if (!entry->show)
 		return -EIO;
 	spin_lock(&all_mddevs_lock);
-	if (list_empty(&mddev->all_mddevs)) {
+	if (!mddev_get(mddev)) {
 		spin_unlock(&all_mddevs_lock);
 		return -EBUSY;
 	}
-	mddev_get(mddev);
 	spin_unlock(&all_mddevs_lock);
 
 	rv = entry->show(mddev, page);
@@ -5557,18 +5539,17 @@ md_attr_store(struct kobject *kobj, struct attribute *attr,
 	if (!capable(CAP_SYS_ADMIN))
 		return -EACCES;
 	spin_lock(&all_mddevs_lock);
-	if (list_empty(&mddev->all_mddevs)) {
+	if (!mddev_get(mddev)) {
 		spin_unlock(&all_mddevs_lock);
 		return -EBUSY;
 	}
-	mddev_get(mddev);
 	spin_unlock(&all_mddevs_lock);
 	rv = entry->store(mddev, page, length);
 	mddev_put(mddev);
 	return rv;
 }
 
-static void md_free(struct kobject *ko)
+static void md_kobj_release(struct kobject *ko)
 {
 	struct mddev *mddev = container_of(ko, struct mddev, kobj);
 
@@ -5577,15 +5558,8 @@ static void md_free(struct kobject *ko)
 	if (mddev->sysfs_level)
 		sysfs_put(mddev->sysfs_level);
 
-	if (mddev->gendisk) {
-		del_gendisk(mddev->gendisk);
-		put_disk(mddev->gendisk);
-	}
-	percpu_ref_exit(&mddev->writes_pending);
-
-	bioset_exit(&mddev->bio_set);
-	bioset_exit(&mddev->sync_set);
-	kfree(mddev);
+	del_gendisk(mddev->gendisk);
+	put_disk(mddev->gendisk);
 }
 
 static const struct sysfs_ops md_sysfs_ops = {
@@ -5593,7 +5567,7 @@ static const struct sysfs_ops md_sysfs_ops = {
 	.store	= md_attr_store,
 };
 static struct kobj_type md_ktype = {
-	.release	= md_free,
+	.release	= md_kobj_release,
 	.sysfs_ops	= &md_sysfs_ops,
 	.default_groups	= md_attr_groups,
 };
@@ -5604,7 +5578,6 @@ static void mddev_delayed_delete(struct work_struct *ws)
 {
 	struct mddev *mddev = container_of(ws, struct mddev, del_work);
 
-	kobject_del(&mddev->kobj);
 	kobject_put(&mddev->kobj);
 }
 
@@ -5623,7 +5596,7 @@ int mddev_init_writes_pending(struct mddev *mddev)
 }
 EXPORT_SYMBOL_GPL(mddev_init_writes_pending);
 
-static int md_alloc(dev_t dev, char *name)
+struct mddev *md_alloc(dev_t dev, char *name)
 {
 	/*
 	 * If dev is zero, name is the name of a device to allocate with
@@ -5651,8 +5624,8 @@ static int md_alloc(dev_t dev, char *name)
 	mutex_lock(&disks_mutex);
 	mddev = mddev_alloc(dev);
 	if (IS_ERR(mddev)) {
-		mutex_unlock(&disks_mutex);
-		return PTR_ERR(mddev);
+		error = PTR_ERR(mddev);
+		goto out_unlock;
 	}
 
 	partitioned = (MAJOR(mddev->unit) != MD_MAJOR);
@@ -5670,7 +5643,7 @@ static int md_alloc(dev_t dev, char *name)
 			    strcmp(mddev2->gendisk->disk_name, name) == 0) {
 				spin_unlock(&all_mddevs_lock);
 				error = -EEXIST;
-				goto out_unlock_disks_mutex;
+				goto out_free_mddev;
 			}
 		spin_unlock(&all_mddevs_lock);
 	}
@@ -5683,7 +5656,7 @@ static int md_alloc(dev_t dev, char *name)
 	error = -ENOMEM;
 	disk = blk_alloc_disk(NUMA_NO_NODE);
 	if (!disk)
-		goto out_unlock_disks_mutex;
+		goto out_free_mddev;
 
 	disk->major = MAJOR(mddev->unit);
 	disk->first_minor = unit << shift;
@@ -5704,25 +5677,45 @@ static int md_alloc(dev_t dev, char *name)
 	mddev->gendisk = disk;
 	error = add_disk(disk);
 	if (error)
-		goto out_cleanup_disk;
+		goto out_put_disk;
 
+	kobject_init(&mddev->kobj, &md_ktype);
 	error = kobject_add(&mddev->kobj, &disk_to_dev(disk)->kobj, "%s", "md");
-	if (error)
-		goto out_del_gendisk;
+	if (error) {
+		/*
+		 * The disk is already live at this point.  Clear the hold flag
+		 * and let mddev_put take care of the deletion, as it isn't any
+		 * different from a normal close on last release now.
+		 */
+		mddev->hold_active = 0;
+		mutex_unlock(&disks_mutex);
+		mddev_put(mddev);
+		return ERR_PTR(error);
+	}
 
 	kobject_uevent(&mddev->kobj, KOBJ_ADD);
 	mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state");
 	mddev->sysfs_level = sysfs_get_dirent_safe(mddev->kobj.sd, "level");
-	goto out_unlock_disks_mutex;
-
-out_del_gendisk:
-	del_gendisk(disk);
-out_cleanup_disk:
-	put_disk(disk);
-out_unlock_disks_mutex:
 	mutex_unlock(&disks_mutex);
+	return mddev;
+
+out_put_disk:
+	put_disk(disk);
+out_free_mddev:
+	mddev_free(mddev);
+out_unlock:
+	mutex_unlock(&disks_mutex);
+	return ERR_PTR(error);
+}
+
+static int md_alloc_and_put(dev_t dev, char *name)
+{
+	struct mddev *mddev = md_alloc(dev, name);
+
+	if (IS_ERR(mddev))
+		return PTR_ERR(mddev);
 	mddev_put(mddev);
-	return error;
+	return 0;
 }
 
 static void md_probe(dev_t dev)
@@ -5730,7 +5723,7 @@ static void md_probe(dev_t dev)
 	if (MAJOR(dev) == MD_MAJOR && MINOR(dev) >= 512)
 		return;
 	if (create_on_open)
-		md_alloc(dev, NULL);
+		md_alloc_and_put(dev, NULL);
 }
 
 static int add_named_array(const char *val, const struct kernel_param *kp)
@@ -5752,12 +5745,12 @@ static int add_named_array(const char *val, const struct kernel_param *kp)
 		return -E2BIG;
 	strscpy(buf, val, len+1);
 	if (strncmp(buf, "md_", 3) == 0)
-		return md_alloc(0, buf);
+		return md_alloc_and_put(0, buf);
 	if (strncmp(buf, "md", 2) == 0 &&
 	    isdigit(buf[2]) &&
 	    kstrtoul(buf+2, 10, &devnum) == 0 &&
 	    devnum <= MINORMASK)
-		return md_alloc(MKDEV(MD_MAJOR, devnum), NULL);
+		return md_alloc_and_put(MKDEV(MD_MAJOR, devnum), NULL);
 
 	return -EINVAL;
 }
@@ -6197,6 +6190,7 @@ static void __md_stop_writes(struct mddev *mddev)
 		flush_workqueue(md_misc_wq);
 	if (mddev->sync_thread) {
 		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+		md_unregister_thread(&mddev->sync_thread);
 		md_reap_sync_thread(mddev);
 	}
 
@@ -6244,11 +6238,11 @@ static void mddev_detach(struct mddev *mddev)
 static void __md_stop(struct mddev *mddev)
 {
 	struct md_personality *pers = mddev->pers;
-	md_bitmap_destroy(mddev);
 	mddev_detach(mddev);
 	/* Ensure ->event_work is done */
 	if (mddev->event_work.func)
 		flush_workqueue(md_misc_wq);
+	md_bitmap_destroy(mddev);
 	spin_lock(&mddev->lock);
 	mddev->pers = NULL;
 	spin_unlock(&mddev->lock);
@@ -6497,9 +6491,8 @@ static void autorun_devices(int part)
 			break;
 		}
 
-		md_probe(dev);
-		mddev = mddev_find(dev);
-		if (!mddev)
+		mddev = md_alloc(dev, NULL);
+		if (IS_ERR(mddev))
 			break;
 
 		if (mddev_lock(mddev))
@@ -7782,45 +7775,33 @@ out_unlock:
 
 static int md_open(struct block_device *bdev, fmode_t mode)
 {
-	/*
-	 * Succeed if we can lock the mddev, which confirms that
-	 * it isn't being stopped right now.
-	 */
-	struct mddev *mddev = mddev_find(bdev->bd_dev);
+	struct mddev *mddev;
 	int err;
 
+	spin_lock(&all_mddevs_lock);
+	mddev = mddev_get(bdev->bd_disk->private_data);
+	spin_unlock(&all_mddevs_lock);
 	if (!mddev)
 		return -ENODEV;
 
-	if (mddev->gendisk != bdev->bd_disk) {
-		/* we are racing with mddev_put which is discarding this
-		 * bd_disk.
-		 */
-		mddev_put(mddev);
-		/* Wait until bdev->bd_disk is definitely gone */
-		if (work_pending(&mddev->del_work))
-			flush_workqueue(md_misc_wq);
-		return -EBUSY;
-	}
-	BUG_ON(mddev != bdev->bd_disk->private_data);
-
-	if ((err = mutex_lock_interruptible(&mddev->open_mutex)))
+	err = mutex_lock_interruptible(&mddev->open_mutex);
+	if (err)
 		goto out;
 
-	if (test_bit(MD_CLOSING, &mddev->flags)) {
-		mutex_unlock(&mddev->open_mutex);
-		err = -ENODEV;
-		goto out;
-	}
+	err = -ENODEV;
+	if (test_bit(MD_CLOSING, &mddev->flags))
+		goto out_unlock;
 
-	err = 0;
 	atomic_inc(&mddev->openers);
 	mutex_unlock(&mddev->open_mutex);
 
 	bdev_check_media_change(bdev);
- out:
-	if (err)
-		mddev_put(mddev);
+	return 0;
+
+out_unlock:
+	mutex_unlock(&mddev->open_mutex);
+out:
+	mddev_put(mddev);
 	return err;
 }
 
@@ -7844,6 +7825,17 @@ static unsigned int md_check_events(struct gendisk *disk, unsigned int clearing)
 	return ret;
 }
 
+static void md_free_disk(struct gendisk *disk)
+{
+	struct mddev *mddev = disk->private_data;
+
+	percpu_ref_exit(&mddev->writes_pending);
+	bioset_exit(&mddev->bio_set);
+	bioset_exit(&mddev->sync_set);
+
+	mddev_free(mddev);
+}
+
 const struct block_device_operations md_fops =
 {
 	.owner		= THIS_MODULE,
@@ -7857,6 +7849,7 @@ const struct block_device_operations md_fops =
 	.getgeo		= md_getgeo,
 	.check_events	= md_check_events,
 	.set_read_only	= md_set_read_only,
+	.free_disk	= md_free_disk,
 };
 
 static int md_thread(void *arg)
@@ -8018,16 +8011,26 @@ static int status_resync(struct seq_file *seq, struct mddev *mddev)
 		max_sectors = mddev->dev_sectors;
 
 	resync = mddev->curr_resync;
-	if (resync <= 3) {
+	if (resync < MD_RESYNC_ACTIVE) {
 		if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
 			/* Still cleaning up */
 			resync = max_sectors;
-	} else if (resync > max_sectors)
+	} else if (resync > max_sectors) {
 		resync = max_sectors;
-	else
+	} else {
 		resync -= atomic_read(&mddev->recovery_active);
+		if (resync < MD_RESYNC_ACTIVE) {
+			/*
+			 * Resync has started, but the subtraction has
+			 * yielded one of the special values. Force it
+			 * to active to ensure the status reports an
+			 * active resync.
+			 */
+			resync = MD_RESYNC_ACTIVE;
+		}
+	}
 
-	if (resync == 0) {
+	if (resync == MD_RESYNC_NONE) {
 		if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery)) {
 			struct md_rdev *rdev;
 
@@ -8051,7 +8054,7 @@ static int status_resync(struct seq_file *seq, struct mddev *mddev)
 		}
 		return 0;
 	}
-	if (resync < 3) {
+	if (resync < MD_RESYNC_ACTIVE) {
 		seq_printf(seq, "\tresync=DELAYED");
 		return 1;
 	}
@@ -8152,6 +8155,8 @@ static void *md_seq_start(struct seq_file *seq, loff_t *pos)
 		if (!l--) {
 			mddev = list_entry(tmp, struct mddev, all_mddevs);
 			mddev_get(mddev);
+			if (!mddev_get(mddev))
+				continue;
 			spin_unlock(&all_mddevs_lock);
 			return mddev;
 		}
@@ -8165,25 +8170,35 @@ static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
 	struct list_head *tmp;
 	struct mddev *next_mddev, *mddev = v;
+	struct mddev *to_put = NULL;
 
 	++*pos;
 	if (v == (void*)2)
 		return NULL;
 
 	spin_lock(&all_mddevs_lock);
-	if (v == (void*)1)
+	if (v == (void*)1) {
 		tmp = all_mddevs.next;
-	else
+	} else {
+		to_put = mddev;
+		tmp = mddev->all_mddevs.next;
+	}
+
+	for (;;) {
+		if (tmp == &all_mddevs) {
+			next_mddev = (void*)2;
+			*pos = 0x10000;
+			break;
+		}
+		next_mddev = list_entry(tmp, struct mddev, all_mddevs);
+		if (mddev_get(next_mddev))
+			break;
+		mddev = next_mddev;
 		tmp = mddev->all_mddevs.next;
-	if (tmp != &all_mddevs)
-		next_mddev = mddev_get(list_entry(tmp,struct mddev,all_mddevs));
-	else {
-		next_mddev = (void*)2;
-		*pos = 0x10000;
 	}
 	spin_unlock(&all_mddevs_lock);
 
-	if (v != (void*)1)
+	if (to_put)
 		mddev_put(mddev);
 	return next_mddev;
 
@@ -8682,7 +8697,6 @@ void md_do_sync(struct md_thread *thread)
 	unsigned long update_time;
 	sector_t mark_cnt[SYNC_MARKS];
 	int last_mark,m;
-	struct list_head *tmp;
 	sector_t last_check;
 	int skipped = 0;
 	struct md_rdev *rdev;
@@ -8729,13 +8743,7 @@ void md_do_sync(struct md_thread *thread)
 
 	mddev->last_sync_action = action ?: desc;
 
-	/* we overload curr_resync somewhat here.
-	 * 0 == not engaged in resync at all
-	 * 2 == checking that there is no conflict with another sync
-	 * 1 == like 2, but have yielded to allow conflicting resync to
-	 *		commence
-	 * other == active in resync - this many blocks
-	 *
+	/*
 	 * Before starting a resync we must have set curr_resync to
 	 * 2, and then checked that every "conflicting" array has curr_resync
 	 * less than ours.  When we find one that is the same or higher
@@ -8747,24 +8755,29 @@ void md_do_sync(struct md_thread *thread)
 
 	do {
 		int mddev2_minor = -1;
-		mddev->curr_resync = 2;
+		mddev->curr_resync = MD_RESYNC_DELAYED;
 
 	try_again:
 		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
 			goto skip;
-		for_each_mddev(mddev2, tmp) {
+		spin_lock(&all_mddevs_lock);
+		list_for_each_entry(mddev2, &all_mddevs, all_mddevs) {
+			if (test_bit(MD_DELETED, &mddev2->flags))
+				continue;
 			if (mddev2 == mddev)
 				continue;
 			if (!mddev->parallel_resync
 			&&  mddev2->curr_resync
 			&&  match_mddev_units(mddev, mddev2)) {
 				DEFINE_WAIT(wq);
-				if (mddev < mddev2 && mddev->curr_resync == 2) {
+				if (mddev < mddev2 &&
+				    mddev->curr_resync == MD_RESYNC_DELAYED) {
 					/* arbitrarily yield */
-					mddev->curr_resync = 1;
+					mddev->curr_resync = MD_RESYNC_YIELDED;
 					wake_up(&resync_wait);
 				}
-				if (mddev > mddev2 && mddev->curr_resync == 1)
+				if (mddev > mddev2 &&
+				    mddev->curr_resync == MD_RESYNC_YIELDED)
 					/* no need to wait here, we can wait the next
 					 * time 'round when curr_resync == 2
 					 */
@@ -8782,7 +8795,8 @@ void md_do_sync(struct md_thread *thread)
 							desc, mdname(mddev),
 							mdname(mddev2));
 					}
-					mddev_put(mddev2);
+					spin_unlock(&all_mddevs_lock);
+
 					if (signal_pending(current))
 						flush_signals(current);
 					schedule();
@@ -8792,7 +8806,8 @@ void md_do_sync(struct md_thread *thread)
 				finish_wait(&resync_wait, &wq);
 			}
 		}
-	} while (mddev->curr_resync < 2);
+		spin_unlock(&all_mddevs_lock);
+	} while (mddev->curr_resync < MD_RESYNC_DELAYED);
 
 	j = 0;
 	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
@@ -8876,7 +8891,7 @@ void md_do_sync(struct md_thread *thread)
 			 desc, mdname(mddev));
 		mddev->curr_resync = j;
 	} else
-		mddev->curr_resync = 3; /* no longer delayed */
+		mddev->curr_resync = MD_RESYNC_ACTIVE; /* no longer delayed */
 	mddev->curr_resync_completed = j;
 	sysfs_notify_dirent_safe(mddev->sysfs_completed);
 	md_new_event();
@@ -9011,14 +9026,14 @@ void md_do_sync(struct md_thread *thread)
 
 	if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
 	    !test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
-	    mddev->curr_resync > 3) {
+	    mddev->curr_resync >= MD_RESYNC_ACTIVE) {
 		mddev->curr_resync_completed = mddev->curr_resync;
 		sysfs_notify_dirent_safe(mddev->sysfs_completed);
 	}
 	mddev->pers->sync_request(mddev, max_sectors, &skipped);
 
 	if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
-	    mddev->curr_resync > 3) {
+	    mddev->curr_resync >= MD_RESYNC_ACTIVE) {
 		if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
 			if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
 				if (mddev->curr_resync >= mddev->recovery_cp) {
@@ -9082,7 +9097,7 @@ void md_do_sync(struct md_thread *thread)
 	} else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
 		mddev->resync_min = mddev->curr_resync_completed;
 	set_bit(MD_RECOVERY_DONE, &mddev->recovery);
-	mddev->curr_resync = 0;
+	mddev->curr_resync = MD_RESYNC_NONE;
 	spin_unlock(&mddev->lock);
 
 	wake_up(&resync_wait);
@@ -9303,6 +9318,7 @@ void md_check_recovery(struct mddev *mddev)
 			 * ->spare_active and clear saved_raid_disk
 			 */
 			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+			md_unregister_thread(&mddev->sync_thread);
 			md_reap_sync_thread(mddev);
 			clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
 			clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
@@ -9338,6 +9354,7 @@ void md_check_recovery(struct mddev *mddev)
 			goto unlock;
 		}
 		if (mddev->sync_thread) {
+			md_unregister_thread(&mddev->sync_thread);
 			md_reap_sync_thread(mddev);
 			goto unlock;
 		}
@@ -9417,8 +9434,7 @@ void md_reap_sync_thread(struct mddev *mddev)
 	sector_t old_dev_sectors = mddev->dev_sectors;
 	bool is_reshaped = false;
 
-	/* resync has finished, collect result */
-	md_unregister_thread(&mddev->sync_thread);
+	/* sync_thread should be unregistered, collect result */
 	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
 	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
 	    mddev->degraded != mddev->raid_disks) {
@@ -9466,6 +9482,7 @@ void md_reap_sync_thread(struct mddev *mddev)
 	wake_up(&resync_wait);
 	/* flag recovery needed just to double check */
 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+	sysfs_notify_dirent_safe(mddev->sysfs_completed);
 	sysfs_notify_dirent_safe(mddev->sysfs_action);
 	md_new_event();
 	if (mddev->event_work.func)
@@ -9544,11 +9561,14 @@ EXPORT_SYMBOL_GPL(rdev_clear_badblocks);
 static int md_notify_reboot(struct notifier_block *this,
 			    unsigned long code, void *x)
 {
-	struct list_head *tmp;
-	struct mddev *mddev;
+	struct mddev *mddev, *n;
 	int need_delay = 0;
 
-	for_each_mddev(mddev, tmp) {
+	spin_lock(&all_mddevs_lock);
+	list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) {
+		if (!mddev_get(mddev))
+			continue;
+		spin_unlock(&all_mddevs_lock);
 		if (mddev_trylock(mddev)) {
 			if (mddev->pers)
 				__md_stop_writes(mddev);
@@ -9557,7 +9577,11 @@ static int md_notify_reboot(struct notifier_block *this,
 			mddev_unlock(mddev);
 		}
 		need_delay = 1;
+		mddev_put(mddev);
+		spin_lock(&all_mddevs_lock);
 	}
+	spin_unlock(&all_mddevs_lock);
+
 	/*
 	 * certain more exotic SCSI devices are known to be
 	 * volatile wrt too early system reboots. While the
@@ -9876,8 +9900,7 @@ void md_autostart_arrays(int part)
 
 static __exit void md_exit(void)
 {
-	struct mddev *mddev;
-	struct list_head *tmp;
+	struct mddev *mddev, *n;
 	int delay = 1;
 
 	unregister_blkdev(MD_MAJOR,"md");
@@ -9897,17 +9920,24 @@ static __exit void md_exit(void)
 	}
 	remove_proc_entry("mdstat", NULL);
 
-	for_each_mddev(mddev, tmp) {
+	spin_lock(&all_mddevs_lock);
+	list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) {
+		if (!mddev_get(mddev))
+			continue;
+		spin_unlock(&all_mddevs_lock);
 		export_array(mddev);
 		mddev->ctime = 0;
 		mddev->hold_active = 0;
 		/*
-		 * for_each_mddev() will call mddev_put() at the end of each
-		 * iteration.  As the mddev is now fully clear, this will
-		 * schedule the mddev for destruction by a workqueue, and the
+		 * As the mddev is now fully clear, mddev_put will schedule
+		 * the mddev for destruction by a workqueue, and the
 		 * destroy_workqueue() below will wait for that to complete.
 		 */
+		mddev_put(mddev);
+		spin_lock(&all_mddevs_lock);
 	}
+	spin_unlock(&all_mddevs_lock);
+
 	destroy_workqueue(md_rdev_misc_wq);
 	destroy_workqueue(md_misc_wq);
 	destroy_workqueue(md_wq);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index b4f84b27bdef..b4e2d8b87b61 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -254,6 +254,7 @@ struct md_cluster_info;
  * @MD_NOT_READY: do_md_run() is active, so 'array_state', ust not report that
  *		   array is ready yet.
  * @MD_BROKEN: This is used to stop writes and mark array as failed.
+ * @MD_DELETED: This device is being deleted
  *
  * change UNSUPPORTED_MDDEV_FLAGS for each array type if new flag is added
  */
@@ -270,6 +271,7 @@ enum mddev_flags {
 	MD_UPDATING_SB,
 	MD_NOT_READY,
 	MD_BROKEN,
+	MD_DELETED,
 };
 
 enum mddev_sb_flags {
@@ -288,6 +290,21 @@ struct serial_info {
 	sector_t _subtree_last; /* highest sector in subtree of rb node */
 };
 
+/*
+ * mddev->curr_resync stores the current sector of the resync but
+ * also has some overloaded values.
+ */
+enum {
+	/* No resync in progress */
+	MD_RESYNC_NONE = 0,
+	/* Yielded to allow another conflicting resync to commence */
+	MD_RESYNC_YIELDED = 1,
+	/* Delayed to check that there is no conflict with another sync */
+	MD_RESYNC_DELAYED = 2,
+	/* Any value greater than or equal to this is in an active resync */
+	MD_RESYNC_ACTIVE = 3,
+};
+
 struct mddev {
 	void				*private;
 	struct md_personality		*pers;
@@ -750,6 +767,8 @@ extern int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev);
 extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale);
 
 extern void mddev_init(struct mddev *mddev);
+struct mddev *md_alloc(dev_t dev, char *name);
+void mddev_put(struct mddev *mddev);
 extern int md_run(struct mddev *mddev);
 extern int md_start(struct mddev *mddev);
 extern void md_stop(struct mddev *mddev);
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 26545950ca42..9117fcdee1be 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -2167,9 +2167,12 @@ static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 	int err = 0;
 	int number = rdev->raid_disk;
 	struct md_rdev **rdevp;
-	struct raid10_info *p = conf->mirrors + number;
+	struct raid10_info *p;
 
 	print_conf(conf);
+	if (unlikely(number >= mddev->raid_disks))
+		return 0;
+	p = conf->mirrors + number;
 	if (rdev == p->rdev)
 		rdevp = &p->rdev;
 	else if (rdev == p->replacement)
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 6f2dd73128b0..f4e1cc1ece43 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -1590,18 +1590,13 @@ void r5l_quiesce(struct r5l_log *log, int quiesce)
 
 bool r5l_log_disk_error(struct r5conf *conf)
 {
-	struct r5l_log *log;
-	bool ret;
-	/* don't allow write if journal disk is missing */
-	rcu_read_lock();
-	log = rcu_dereference(conf->log);
+	struct r5l_log *log = conf->log;
 
+	/* don't allow write if journal disk is missing */
 	if (!log)
-		ret = test_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
+		return test_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
 	else
-		ret = test_bit(Faulty, &log->rdev->flags);
-	rcu_read_unlock();
-	return ret;
+		return test_bit(Faulty, &log->rdev->flags);
 }
 
 #define R5L_RECOVERY_PAGE_POOL_SIZE 256
@@ -2534,12 +2529,13 @@ static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page)
 	struct r5conf *conf;
 	int ret;
 
-	spin_lock(&mddev->lock);
+	ret = mddev_lock(mddev);
+	if (ret)
+		return ret;
+
 	conf = mddev->private;
-	if (!conf || !conf->log) {
-		spin_unlock(&mddev->lock);
-		return 0;
-	}
+	if (!conf || !conf->log)
+		goto out_unlock;
 
 	switch (conf->log->r5c_journal_mode) {
 	case R5C_JOURNAL_MODE_WRITE_THROUGH:
@@ -2557,7 +2553,9 @@ static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page)
 	default:
 		ret = 0;
 	}
-	spin_unlock(&mddev->lock);
+
+out_unlock:
+	mddev_unlock(mddev);
 	return ret;
 }
 
@@ -2639,7 +2637,7 @@ int r5c_try_caching_write(struct r5conf *conf,
 	int i;
 	struct r5dev *dev;
 	int to_cache = 0;
-	void **pslot;
+	void __rcu **pslot;
 	sector_t tree_index;
 	int ret;
 	uintptr_t refcount;
@@ -2806,7 +2804,7 @@ void r5c_finish_stripe_write_out(struct r5conf *conf,
 	int i;
 	int do_wakeup = 0;
 	sector_t tree_index;
-	void **pslot;
+	void __rcu **pslot;
 	uintptr_t refcount;
 
 	if (!log || !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags))
@@ -3145,7 +3143,7 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 	spin_lock_init(&log->stripe_in_journal_lock);
 	atomic_set(&log->stripe_in_journal_count, 0);
 
-	rcu_assign_pointer(conf->log, log);
+	conf->log = log;
 
 	set_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
 	return 0;
@@ -3167,13 +3165,13 @@ void r5l_exit_log(struct r5conf *conf)
 {
 	struct r5l_log *log = conf->log;
 
-	conf->log = NULL;
-	synchronize_rcu();
-
 	/* Ensure disable_writeback_work wakes up and exits */
 	wake_up(&conf->mddev->sb_wait);
 	flush_work(&log->disable_writeback_work);
 	md_unregister_thread(&log->reclaim_thread);
+
+	conf->log = NULL;
+
 	mempool_exit(&log->meta_pool);
 	bioset_exit(&log->bs);
 	mempool_exit(&log->io_pool);
diff --git a/drivers/md/raid5-log.h b/drivers/md/raid5-log.h
index 43c714a8798c..c8332502669e 100644
--- a/drivers/md/raid5-log.h
+++ b/drivers/md/raid5-log.h
@@ -2,49 +2,46 @@
 #ifndef _RAID5_LOG_H
 #define _RAID5_LOG_H
 
-extern int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev);
-extern void r5l_exit_log(struct r5conf *conf);
-extern int r5l_write_stripe(struct r5l_log *log, struct stripe_head *head_sh);
-extern void r5l_write_stripe_run(struct r5l_log *log);
-extern void r5l_flush_stripe_to_raid(struct r5l_log *log);
-extern void r5l_stripe_write_finished(struct stripe_head *sh);
-extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio);
-extern void r5l_quiesce(struct r5l_log *log, int quiesce);
-extern bool r5l_log_disk_error(struct r5conf *conf);
-extern bool r5c_is_writeback(struct r5l_log *log);
-extern int
-r5c_try_caching_write(struct r5conf *conf, struct stripe_head *sh,
-		      struct stripe_head_state *s, int disks);
-extern void
-r5c_finish_stripe_write_out(struct r5conf *conf, struct stripe_head *sh,
-			    struct stripe_head_state *s);
-extern void r5c_release_extra_page(struct stripe_head *sh);
-extern void r5c_use_extra_page(struct stripe_head *sh);
-extern void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
-extern void r5c_handle_cached_data_endio(struct r5conf *conf,
-	struct stripe_head *sh, int disks);
-extern int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh);
-extern void r5c_make_stripe_write_out(struct stripe_head *sh);
-extern void r5c_flush_cache(struct r5conf *conf, int num);
-extern void r5c_check_stripe_cache_usage(struct r5conf *conf);
-extern void r5c_check_cached_full_stripe(struct r5conf *conf);
+int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev);
+void r5l_exit_log(struct r5conf *conf);
+int r5l_write_stripe(struct r5l_log *log, struct stripe_head *head_sh);
+void r5l_write_stripe_run(struct r5l_log *log);
+void r5l_flush_stripe_to_raid(struct r5l_log *log);
+void r5l_stripe_write_finished(struct stripe_head *sh);
+int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio);
+void r5l_quiesce(struct r5l_log *log, int quiesce);
+bool r5l_log_disk_error(struct r5conf *conf);
+bool r5c_is_writeback(struct r5l_log *log);
+int r5c_try_caching_write(struct r5conf *conf, struct stripe_head *sh,
+			  struct stripe_head_state *s, int disks);
+void r5c_finish_stripe_write_out(struct r5conf *conf, struct stripe_head *sh,
+				 struct stripe_head_state *s);
+void r5c_release_extra_page(struct stripe_head *sh);
+void r5c_use_extra_page(struct stripe_head *sh);
+void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
+void r5c_handle_cached_data_endio(struct r5conf *conf,
+				  struct stripe_head *sh, int disks);
+int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh);
+void r5c_make_stripe_write_out(struct stripe_head *sh);
+void r5c_flush_cache(struct r5conf *conf, int num);
+void r5c_check_stripe_cache_usage(struct r5conf *conf);
+void r5c_check_cached_full_stripe(struct r5conf *conf);
 extern struct md_sysfs_entry r5c_journal_mode;
-extern void r5c_update_on_rdev_error(struct mddev *mddev,
-				     struct md_rdev *rdev);
-extern bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect);
-extern int r5l_start(struct r5l_log *log);
+void r5c_update_on_rdev_error(struct mddev *mddev, struct md_rdev *rdev);
+bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect);
+int r5l_start(struct r5l_log *log);
 
-extern struct dma_async_tx_descriptor *
+struct dma_async_tx_descriptor *
 ops_run_partial_parity(struct stripe_head *sh, struct raid5_percpu *percpu,
 		       struct dma_async_tx_descriptor *tx);
-extern int ppl_init_log(struct r5conf *conf);
-extern void ppl_exit_log(struct r5conf *conf);
-extern int ppl_write_stripe(struct r5conf *conf, struct stripe_head *sh);
-extern void ppl_write_stripe_run(struct r5conf *conf);
-extern void ppl_stripe_write_finished(struct stripe_head *sh);
-extern int ppl_modify_log(struct r5conf *conf, struct md_rdev *rdev, bool add);
-extern void ppl_quiesce(struct r5conf *conf, int quiesce);
-extern int ppl_handle_flush_request(struct r5l_log *log, struct bio *bio);
+int ppl_init_log(struct r5conf *conf);
+void ppl_exit_log(struct r5conf *conf);
+int ppl_write_stripe(struct r5conf *conf, struct stripe_head *sh);
+void ppl_write_stripe_run(struct r5conf *conf);
+void ppl_stripe_write_finished(struct stripe_head *sh);
+int ppl_modify_log(struct r5conf *conf, struct md_rdev *rdev, bool add);
+void ppl_quiesce(struct r5conf *conf, int quiesce);
+int ppl_handle_flush_request(struct bio *bio);
 extern struct md_sysfs_entry ppl_write_hint;
 
 static inline bool raid5_has_log(struct r5conf *conf)
@@ -111,7 +108,7 @@ static inline int log_handle_flush_request(struct r5conf *conf, struct bio *bio)
 	if (conf->log)
 		ret = r5l_handle_flush_request(conf->log, bio);
 	else if (raid5_has_ppl(conf))
-		ret = ppl_handle_flush_request(conf->log, bio);
+		ret = ppl_handle_flush_request(bio);
 
 	return ret;
 }
diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
index 98988cb26295..31b9157bc9ae 100644
--- a/drivers/md/raid5-ppl.c
+++ b/drivers/md/raid5-ppl.c
@@ -679,7 +679,7 @@ void ppl_quiesce(struct r5conf *conf, int quiesce)
 	}
 }
 
-int ppl_handle_flush_request(struct r5l_log *log, struct bio *bio)
+int ppl_handle_flush_request(struct bio *bio)
 {
 	if (bio->bi_iter.bi_size == 0) {
 		bio_endio(bio);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 5cabdbbac48b..860c45c10a57 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -61,6 +61,8 @@
 #define cpu_to_group(cpu) cpu_to_node(cpu)
 #define ANY_GROUP NUMA_NO_NODE
 
+#define RAID5_MAX_REQ_STRIPES 256
+
 static bool devices_handle_discard_safely = false;
 module_param(devices_handle_discard_safely, bool, 0644);
 MODULE_PARM_DESC(devices_handle_discard_safely,
@@ -624,6 +626,49 @@ static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
 	return NULL;
 }
 
+static struct stripe_head *find_get_stripe(struct r5conf *conf,
+		sector_t sector, short generation, int hash)
+{
+	int inc_empty_inactive_list_flag;
+	struct stripe_head *sh;
+
+	sh = __find_stripe(conf, sector, generation);
+	if (!sh)
+		return NULL;
+
+	if (atomic_inc_not_zero(&sh->count))
+		return sh;
+
+	/*
+	 * Slow path. The reference count is zero which means the stripe must
+	 * be on a list (sh->lru). Must remove the stripe from the list that
+	 * references it with the device_lock held.
+	 */
+
+	spin_lock(&conf->device_lock);
+	if (!atomic_read(&sh->count)) {
+		if (!test_bit(STRIPE_HANDLE, &sh->state))
+			atomic_inc(&conf->active_stripes);
+		BUG_ON(list_empty(&sh->lru) &&
+		       !test_bit(STRIPE_EXPANDING, &sh->state));
+		inc_empty_inactive_list_flag = 0;
+		if (!list_empty(conf->inactive_list + hash))
+			inc_empty_inactive_list_flag = 1;
+		list_del_init(&sh->lru);
+		if (list_empty(conf->inactive_list + hash) &&
+		    inc_empty_inactive_list_flag)
+			atomic_inc(&conf->empty_inactive_list_nr);
+		if (sh->group) {
+			sh->group->stripes_cnt--;
+			sh->group = NULL;
+		}
+	}
+	atomic_inc(&sh->count);
+	spin_unlock(&conf->device_lock);
+
+	return sh;
+}
+
 /*
  * Need to check if array has failed when deciding whether to:
  *  - start an array
@@ -710,80 +755,121 @@ static bool has_failed(struct r5conf *conf)
 	return degraded > conf->max_degraded;
 }
 
-struct stripe_head *
-raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
-			int previous, int noblock, int noquiesce)
+enum stripe_result {
+	STRIPE_SUCCESS = 0,
+	STRIPE_RETRY,
+	STRIPE_SCHEDULE_AND_RETRY,
+	STRIPE_FAIL,
+};
+
+struct stripe_request_ctx {
+	/* a reference to the last stripe_head for batching */
+	struct stripe_head *batch_last;
+
+	/* first sector in the request */
+	sector_t first_sector;
+
+	/* last sector in the request */
+	sector_t last_sector;
+
+	/*
+	 * bitmap to track stripe sectors that have been added to stripes
+	 * add one to account for unaligned requests
+	 */
+	DECLARE_BITMAP(sectors_to_do, RAID5_MAX_REQ_STRIPES + 1);
+
+	/* the request had REQ_PREFLUSH, cleared after the first stripe_head */
+	bool do_flush;
+};
+
+/*
+ * Block until another thread clears R5_INACTIVE_BLOCKED or
+ * there are fewer than 3/4 the maximum number of active stripes
+ * and there is an inactive stripe available.
+ */
+static bool is_inactive_blocked(struct r5conf *conf, int hash)
+{
+	int active = atomic_read(&conf->active_stripes);
+
+	if (list_empty(conf->inactive_list + hash))
+		return false;
+
+	if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state))
+		return true;
+
+	return active < (conf->max_nr_stripes * 3 / 4);
+}
+
+static struct stripe_head *__raid5_get_active_stripe(struct r5conf *conf,
+		struct stripe_request_ctx *ctx, sector_t sector,
+		bool previous, bool noblock, bool noquiesce)
 {
 	struct stripe_head *sh;
 	int hash = stripe_hash_locks_hash(conf, sector);
-	int inc_empty_inactive_list_flag;
 
 	pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);
 
 	spin_lock_irq(conf->hash_locks + hash);
 
-	do {
-		wait_event_lock_irq(conf->wait_for_quiescent,
-				    conf->quiesce == 0 || noquiesce,
-				    *(conf->hash_locks + hash));
-		sh = __find_stripe(conf, sector, conf->generation - previous);
-		if (!sh) {
-			if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) {
-				sh = get_free_stripe(conf, hash);
-				if (!sh && !test_bit(R5_DID_ALLOC,
-						     &conf->cache_state))
-					set_bit(R5_ALLOC_MORE,
-						&conf->cache_state);
-			}
-			if (noblock && sh == NULL)
-				break;
-
-			r5c_check_stripe_cache_usage(conf);
-			if (!sh) {
-				set_bit(R5_INACTIVE_BLOCKED,
-					&conf->cache_state);
-				r5l_wake_reclaim(conf->log, 0);
-				wait_event_lock_irq(
-					conf->wait_for_stripe,
-					!list_empty(conf->inactive_list + hash) &&
-					(atomic_read(&conf->active_stripes)
-					 < (conf->max_nr_stripes * 3 / 4)
-					 || !test_bit(R5_INACTIVE_BLOCKED,
-						      &conf->cache_state)),
-					*(conf->hash_locks + hash));
-				clear_bit(R5_INACTIVE_BLOCKED,
-					  &conf->cache_state);
-			} else {
-				init_stripe(sh, sector, previous);
-				atomic_inc(&sh->count);
-			}
-		} else if (!atomic_inc_not_zero(&sh->count)) {
-			spin_lock(&conf->device_lock);
-			if (!atomic_read(&sh->count)) {
-				if (!test_bit(STRIPE_HANDLE, &sh->state))
-					atomic_inc(&conf->active_stripes);
-				BUG_ON(list_empty(&sh->lru) &&
-				       !test_bit(STRIPE_EXPANDING, &sh->state));
-				inc_empty_inactive_list_flag = 0;
-				if (!list_empty(conf->inactive_list + hash))
-					inc_empty_inactive_list_flag = 1;
-				list_del_init(&sh->lru);
-				if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag)
-					atomic_inc(&conf->empty_inactive_list_nr);
-				if (sh->group) {
-					sh->group->stripes_cnt--;
-					sh->group = NULL;
-				}
-			}
-			atomic_inc(&sh->count);
-			spin_unlock(&conf->device_lock);
+retry:
+	if (!noquiesce && conf->quiesce) {
+		/*
+		 * Must release the reference to batch_last before waiting,
+		 * on quiesce, otherwise the batch_last will hold a reference
+		 * to a stripe and raid5_quiesce() will deadlock waiting for
+		 * active_stripes to go to zero.
+		 */
+		if (ctx && ctx->batch_last) {
+			raid5_release_stripe(ctx->batch_last);
+			ctx->batch_last = NULL;
 		}
-	} while (sh == NULL);
 
+		wait_event_lock_irq(conf->wait_for_quiescent, !conf->quiesce,
+				    *(conf->hash_locks + hash));
+	}
+
+	sh = find_get_stripe(conf, sector, conf->generation - previous, hash);
+	if (sh)
+		goto out;
+
+	if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state))
+		goto wait_for_stripe;
+
+	sh = get_free_stripe(conf, hash);
+	if (sh) {
+		r5c_check_stripe_cache_usage(conf);
+		init_stripe(sh, sector, previous);
+		atomic_inc(&sh->count);
+		goto out;
+	}
+
+	if (!test_bit(R5_DID_ALLOC, &conf->cache_state))
+		set_bit(R5_ALLOC_MORE, &conf->cache_state);
+
+wait_for_stripe:
+	if (noblock)
+		goto out;
+
+	set_bit(R5_INACTIVE_BLOCKED, &conf->cache_state);
+	r5l_wake_reclaim(conf->log, 0);
+	wait_event_lock_irq(conf->wait_for_stripe,
+			    is_inactive_blocked(conf, hash),
+			    *(conf->hash_locks + hash));
+	clear_bit(R5_INACTIVE_BLOCKED, &conf->cache_state);
+	goto retry;
+
+out:
 	spin_unlock_irq(conf->hash_locks + hash);
 	return sh;
 }
 
+struct stripe_head *raid5_get_active_stripe(struct r5conf *conf,
+		sector_t sector, bool previous, bool noblock, bool noquiesce)
+{
+	return __raid5_get_active_stripe(conf, NULL, sector, previous, noblock,
+					 noquiesce);
+}
+
 static bool is_full_stripe_write(struct stripe_head *sh)
 {
 	BUG_ON(sh->overwrite_disks > (sh->disks - sh->raid_conf->max_degraded));
@@ -824,13 +910,13 @@ static bool stripe_can_batch(struct stripe_head *sh)
 }
 
 /* we only do back search */
-static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh)
+static void stripe_add_to_batch_list(struct r5conf *conf,
+		struct stripe_head *sh, struct stripe_head *last_sh)
 {
 	struct stripe_head *head;
 	sector_t head_sector, tmp_sec;
 	int hash;
 	int dd_idx;
-	int inc_empty_inactive_list_flag;
 
 	/* Don't cross chunks, so stripe pd_idx/qd_idx is the same */
 	tmp_sec = sh->sector;
@@ -838,36 +924,20 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh
 		return;
 	head_sector = sh->sector - RAID5_STRIPE_SECTORS(conf);
 
-	hash = stripe_hash_locks_hash(conf, head_sector);
-	spin_lock_irq(conf->hash_locks + hash);
-	head = __find_stripe(conf, head_sector, conf->generation);
-	if (head && !atomic_inc_not_zero(&head->count)) {
-		spin_lock(&conf->device_lock);
-		if (!atomic_read(&head->count)) {
-			if (!test_bit(STRIPE_HANDLE, &head->state))
-				atomic_inc(&conf->active_stripes);
-			BUG_ON(list_empty(&head->lru) &&
-			       !test_bit(STRIPE_EXPANDING, &head->state));
-			inc_empty_inactive_list_flag = 0;
-			if (!list_empty(conf->inactive_list + hash))
-				inc_empty_inactive_list_flag = 1;
-			list_del_init(&head->lru);
-			if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag)
-				atomic_inc(&conf->empty_inactive_list_nr);
-			if (head->group) {
-				head->group->stripes_cnt--;
-				head->group = NULL;
-			}
-		}
+	if (last_sh && head_sector == last_sh->sector) {
+		head = last_sh;
 		atomic_inc(&head->count);
-		spin_unlock(&conf->device_lock);
+	} else {
+		hash = stripe_hash_locks_hash(conf, head_sector);
+		spin_lock_irq(conf->hash_locks + hash);
+		head = find_get_stripe(conf, head_sector, conf->generation,
+				       hash);
+		spin_unlock_irq(conf->hash_locks + hash);
+		if (!head)
+			return;
+		if (!stripe_can_batch(head))
+			goto out;
 	}
-	spin_unlock_irq(conf->hash_locks + hash);
-
-	if (!head)
-		return;
-	if (!stripe_can_batch(head))
-		goto out;
 
 	lock_two_stripes(head, sh);
 	/* clear_batch_ready clear the flag */
@@ -2882,10 +2952,10 @@ static void raid5_end_write_request(struct bio *bi)
 	if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
 		clear_bit(R5_LOCKED, &sh->dev[i].flags);
 	set_bit(STRIPE_HANDLE, &sh->state);
-	raid5_release_stripe(sh);
 
 	if (sh->batch_head && sh != sh->batch_head)
 		raid5_release_stripe(sh->batch_head);
+	raid5_release_stripe(sh);
 }
 
 static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)
@@ -3413,39 +3483,32 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
 		s->locked, s->ops_request);
 }
 
-/*
- * Each stripe/dev can have one or more bion attached.
- * toread/towrite point to the first in a chain.
- * The bi_next chain must be in order.
- */
-static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
-			  int forwrite, int previous)
+static bool stripe_bio_overlaps(struct stripe_head *sh, struct bio *bi,
+				int dd_idx, int forwrite)
 {
-	struct bio **bip;
 	struct r5conf *conf = sh->raid_conf;
-	int firstwrite=0;
+	struct bio **bip;
 
-	pr_debug("adding bi b#%llu to stripe s#%llu\n",
-		(unsigned long long)bi->bi_iter.bi_sector,
-		(unsigned long long)sh->sector);
+	pr_debug("checking bi b#%llu to stripe s#%llu\n",
+		 bi->bi_iter.bi_sector, sh->sector);
 
-	spin_lock_irq(&sh->stripe_lock);
 	/* Don't allow new IO added to stripes in batch list */
 	if (sh->batch_head)
-		goto overlap;
-	if (forwrite) {
+		return true;
+
+	if (forwrite)
 		bip = &sh->dev[dd_idx].towrite;
-		if (*bip == NULL)
-			firstwrite = 1;
-	} else
+	else
 		bip = &sh->dev[dd_idx].toread;
+
 	while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector) {
 		if (bio_end_sector(*bip) > bi->bi_iter.bi_sector)
-			goto overlap;
-		bip = & (*bip)->bi_next;
+			return true;
+		bip = &(*bip)->bi_next;
 	}
+
 	if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi))
-		goto overlap;
+		return true;
 
 	if (forwrite && raid5_has_ppl(conf)) {
 		/*
@@ -3474,9 +3537,30 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
 		}
 
 		if (first + conf->chunk_sectors * (count - 1) != last)
-			goto overlap;
+			return true;
 	}
 
+	return false;
+}
+
+static void __add_stripe_bio(struct stripe_head *sh, struct bio *bi,
+			     int dd_idx, int forwrite, int previous)
+{
+	struct r5conf *conf = sh->raid_conf;
+	struct bio **bip;
+	int firstwrite = 0;
+
+	if (forwrite) {
+		bip = &sh->dev[dd_idx].towrite;
+		if (!*bip)
+			firstwrite = 1;
+	} else {
+		bip = &sh->dev[dd_idx].toread;
+	}
+
+	while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector)
+		bip = &(*bip)->bi_next;
+
 	if (!forwrite || previous)
 		clear_bit(STRIPE_BATCH_READY, &sh->state);
 
@@ -3502,9 +3586,9 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
 				sh->overwrite_disks++;
 	}
 
-	pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
-		(unsigned long long)(*bip)->bi_iter.bi_sector,
-		(unsigned long long)sh->sector, dd_idx);
+	pr_debug("added bi b#%llu to stripe s#%llu, disk %d, logical %llu\n",
+		 (*bip)->bi_iter.bi_sector, sh->sector, dd_idx,
+		 sh->dev[dd_idx].sector);
 
 	if (conf->mddev->bitmap && firstwrite) {
 		/* Cannot hold spinlock over bitmap_startwrite,
@@ -3512,7 +3596,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
 		 * we have added to the bitmap and set bm_seq.
 		 * So set STRIPE_BITMAP_PENDING to prevent
 		 * batching.
-		 * If multiple add_stripe_bio() calls race here they
+		 * If multiple __add_stripe_bio() calls race here they
 		 * much all set STRIPE_BITMAP_PENDING.  So only the first one
 		 * to complete "bitmap_startwrite" gets to set
 		 * STRIPE_BIT_DELAY.  This is important as once a stripe
@@ -3530,16 +3614,27 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
 			set_bit(STRIPE_BIT_DELAY, &sh->state);
 		}
 	}
-	spin_unlock_irq(&sh->stripe_lock);
+}
 
-	if (stripe_can_batch(sh))
-		stripe_add_to_batch_list(conf, sh);
-	return 1;
+/*
+ * Each stripe/dev can have one or more bios attached.
+ * toread/towrite point to the first in a chain.
+ * The bi_next chain must be in order.
+ */
+static bool add_stripe_bio(struct stripe_head *sh, struct bio *bi,
+			   int dd_idx, int forwrite, int previous)
+{
+	spin_lock_irq(&sh->stripe_lock);
 
- overlap:
-	set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
+	if (stripe_bio_overlaps(sh, bi, dd_idx, forwrite)) {
+		set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
+		spin_unlock_irq(&sh->stripe_lock);
+		return false;
+	}
+
+	__add_stripe_bio(sh, bi, dd_idx, forwrite, previous);
 	spin_unlock_irq(&sh->stripe_lock);
-	return 0;
+	return true;
 }
 
 static void end_reshape(struct r5conf *conf);
@@ -5785,17 +5880,215 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
 	bio_endio(bi);
 }
 
+static bool ahead_of_reshape(struct mddev *mddev, sector_t sector,
+			     sector_t reshape_sector)
+{
+	return mddev->reshape_backwards ? sector < reshape_sector :
+					  sector >= reshape_sector;
+}
+
+static bool range_ahead_of_reshape(struct mddev *mddev, sector_t min,
+				   sector_t max, sector_t reshape_sector)
+{
+	return mddev->reshape_backwards ? max < reshape_sector :
+					  min >= reshape_sector;
+}
+
+static bool stripe_ahead_of_reshape(struct mddev *mddev, struct r5conf *conf,
+				    struct stripe_head *sh)
+{
+	sector_t max_sector = 0, min_sector = MaxSector;
+	bool ret = false;
+	int dd_idx;
+
+	for (dd_idx = 0; dd_idx < sh->disks; dd_idx++) {
+		if (dd_idx == sh->pd_idx)
+			continue;
+
+		min_sector = min(min_sector, sh->dev[dd_idx].sector);
+		max_sector = min(max_sector, sh->dev[dd_idx].sector);
+	}
+
+	spin_lock_irq(&conf->device_lock);
+
+	if (!range_ahead_of_reshape(mddev, min_sector, max_sector,
+				     conf->reshape_progress))
+		/* mismatch, need to try again */
+		ret = true;
+
+	spin_unlock_irq(&conf->device_lock);
+
+	return ret;
+}
+
+static int add_all_stripe_bios(struct r5conf *conf,
+		struct stripe_request_ctx *ctx, struct stripe_head *sh,
+		struct bio *bi, int forwrite, int previous)
+{
+	int dd_idx;
+	int ret = 1;
+
+	spin_lock_irq(&sh->stripe_lock);
+
+	for (dd_idx = 0; dd_idx < sh->disks; dd_idx++) {
+		struct r5dev *dev = &sh->dev[dd_idx];
+
+		if (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx)
+			continue;
+
+		if (dev->sector < ctx->first_sector ||
+		    dev->sector >= ctx->last_sector)
+			continue;
+
+		if (stripe_bio_overlaps(sh, bi, dd_idx, forwrite)) {
+			set_bit(R5_Overlap, &dev->flags);
+			ret = 0;
+			continue;
+		}
+	}
+
+	if (!ret)
+		goto out;
+
+	for (dd_idx = 0; dd_idx < sh->disks; dd_idx++) {
+		struct r5dev *dev = &sh->dev[dd_idx];
+
+		if (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx)
+			continue;
+
+		if (dev->sector < ctx->first_sector ||
+		    dev->sector >= ctx->last_sector)
+			continue;
+
+		__add_stripe_bio(sh, bi, dd_idx, forwrite, previous);
+		clear_bit((dev->sector - ctx->first_sector) >>
+			  RAID5_STRIPE_SHIFT(conf), ctx->sectors_to_do);
+	}
+
+out:
+	spin_unlock_irq(&sh->stripe_lock);
+	return ret;
+}
+
+static enum stripe_result make_stripe_request(struct mddev *mddev,
+		struct r5conf *conf, struct stripe_request_ctx *ctx,
+		sector_t logical_sector, struct bio *bi)
+{
+	const int rw = bio_data_dir(bi);
+	enum stripe_result ret;
+	struct stripe_head *sh;
+	sector_t new_sector;
+	int previous = 0;
+	int seq, dd_idx;
+
+	seq = read_seqcount_begin(&conf->gen_lock);
+
+	if (unlikely(conf->reshape_progress != MaxSector)) {
+		/*
+		 * Spinlock is needed as reshape_progress may be
+		 * 64bit on a 32bit platform, and so it might be
+		 * possible to see a half-updated value
+		 * Of course reshape_progress could change after
+		 * the lock is dropped, so once we get a reference
+		 * to the stripe that we think it is, we will have
+		 * to check again.
+		 */
+		spin_lock_irq(&conf->device_lock);
+		if (ahead_of_reshape(mddev, logical_sector,
+				     conf->reshape_progress)) {
+			previous = 1;
+		} else {
+			if (ahead_of_reshape(mddev, logical_sector,
+					     conf->reshape_safe)) {
+				spin_unlock_irq(&conf->device_lock);
+				return STRIPE_SCHEDULE_AND_RETRY;
+			}
+		}
+		spin_unlock_irq(&conf->device_lock);
+	}
+
+	new_sector = raid5_compute_sector(conf, logical_sector, previous,
+					  &dd_idx, NULL);
+	pr_debug("raid456: %s, sector %llu logical %llu\n", __func__,
+		 new_sector, logical_sector);
+
+	sh = __raid5_get_active_stripe(conf, ctx, new_sector, previous,
+				       (bi->bi_opf & REQ_RAHEAD), 0);
+	if (unlikely(!sh)) {
+		/* cannot get stripe, just give-up */
+		bi->bi_status = BLK_STS_IOERR;
+		return STRIPE_FAIL;
+	}
+
+	if (unlikely(previous) &&
+	    stripe_ahead_of_reshape(mddev, conf, sh)) {
+		/*
+		 * Expansion moved on while waiting for a stripe.
+		 * Expansion could still move past after this
+		 * test, but as we are holding a reference to
+		 * 'sh', we know that if that happens,
+		 *  STRIPE_EXPANDING will get set and the expansion
+		 * won't proceed until we finish with the stripe.
+		 */
+		ret = STRIPE_SCHEDULE_AND_RETRY;
+		goto out_release;
+	}
+
+	if (read_seqcount_retry(&conf->gen_lock, seq)) {
+		/* Might have got the wrong stripe_head by accident */
+		ret = STRIPE_RETRY;
+		goto out_release;
+	}
+
+	if (test_bit(STRIPE_EXPANDING, &sh->state) ||
+	    !add_all_stripe_bios(conf, ctx, sh, bi, rw, previous)) {
+		/*
+		 * Stripe is busy expanding or add failed due to
+		 * overlap. Flush everything and wait a while.
+		 */
+		md_wakeup_thread(mddev->thread);
+		ret = STRIPE_SCHEDULE_AND_RETRY;
+		goto out_release;
+	}
+
+	if (stripe_can_batch(sh)) {
+		stripe_add_to_batch_list(conf, sh, ctx->batch_last);
+		if (ctx->batch_last)
+			raid5_release_stripe(ctx->batch_last);
+		atomic_inc(&sh->count);
+		ctx->batch_last = sh;
+	}
+
+	if (ctx->do_flush) {
+		set_bit(STRIPE_R5C_PREFLUSH, &sh->state);
+		/* we only need flush for one stripe */
+		ctx->do_flush = false;
+	}
+
+	set_bit(STRIPE_HANDLE, &sh->state);
+	clear_bit(STRIPE_DELAYED, &sh->state);
+	if ((!sh->batch_head || sh == sh->batch_head) &&
+	    (bi->bi_opf & REQ_SYNC) &&
+	    !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+		atomic_inc(&conf->preread_active_stripes);
+
+	release_stripe_plug(mddev, sh);
+	return STRIPE_SUCCESS;
+
+out_release:
+	raid5_release_stripe(sh);
+	return ret;
+}
+
 static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
 {
+	DEFINE_WAIT_FUNC(wait, woken_wake_function);
 	struct r5conf *conf = mddev->private;
-	int dd_idx;
-	sector_t new_sector;
-	sector_t logical_sector, last_sector;
-	struct stripe_head *sh;
+	sector_t logical_sector;
+	struct stripe_request_ctx ctx = {};
 	const int rw = bio_data_dir(bi);
-	DEFINE_WAIT(w);
-	bool do_prepare;
-	bool do_flush = false;
+	enum stripe_result res;
+	int s, stripe_cnt;
 
 	if (unlikely(bi->bi_opf & REQ_PREFLUSH)) {
 		int ret = log_handle_flush_request(conf, bi);
@@ -5811,7 +6104,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
 		 * if r5l_handle_flush_request() didn't clear REQ_PREFLUSH,
 		 * we need to flush journal device
 		 */
-		do_flush = bi->bi_opf & REQ_PREFLUSH;
+		ctx.do_flush = bi->bi_opf & REQ_PREFLUSH;
 	}
 
 	if (!md_write_start(mddev, bi))
@@ -5835,134 +6128,68 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
 	}
 
 	logical_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1);
-	last_sector = bio_end_sector(bi);
+	ctx.first_sector = logical_sector;
+	ctx.last_sector = bio_end_sector(bi);
 	bi->bi_next = NULL;
 
+	stripe_cnt = DIV_ROUND_UP_SECTOR_T(ctx.last_sector - logical_sector,
+					   RAID5_STRIPE_SECTORS(conf));
+	bitmap_set(ctx.sectors_to_do, 0, stripe_cnt);
+
+	pr_debug("raid456: %s, logical %llu to %llu\n", __func__,
+		 bi->bi_iter.bi_sector, ctx.last_sector);
+
 	/* Bail out if conflicts with reshape and REQ_NOWAIT is set */
 	if ((bi->bi_opf & REQ_NOWAIT) &&
 	    (conf->reshape_progress != MaxSector) &&
-	    (mddev->reshape_backwards
-	    ? (logical_sector > conf->reshape_progress && logical_sector <= conf->reshape_safe)
-	    : (logical_sector >= conf->reshape_safe && logical_sector < conf->reshape_progress))) {
+	    !ahead_of_reshape(mddev, logical_sector, conf->reshape_progress) &&
+	    ahead_of_reshape(mddev, logical_sector, conf->reshape_safe)) {
 		bio_wouldblock_error(bi);
 		if (rw == WRITE)
 			md_write_end(mddev);
 		return true;
 	}
 	md_account_bio(mddev, &bi);
-	prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
-	for (; logical_sector < last_sector; logical_sector += RAID5_STRIPE_SECTORS(conf)) {
-		int previous;
-		int seq;
 
-		do_prepare = false;
-	retry:
-		seq = read_seqcount_begin(&conf->gen_lock);
-		previous = 0;
-		if (do_prepare)
-			prepare_to_wait(&conf->wait_for_overlap, &w,
-				TASK_UNINTERRUPTIBLE);
-		if (unlikely(conf->reshape_progress != MaxSector)) {
-			/* spinlock is needed as reshape_progress may be
-			 * 64bit on a 32bit platform, and so it might be
-			 * possible to see a half-updated value
-			 * Of course reshape_progress could change after
-			 * the lock is dropped, so once we get a reference
-			 * to the stripe that we think it is, we will have
-			 * to check again.
-			 */
-			spin_lock_irq(&conf->device_lock);
-			if (mddev->reshape_backwards
-			    ? logical_sector < conf->reshape_progress
-			    : logical_sector >= conf->reshape_progress) {
-				previous = 1;
-			} else {
-				if (mddev->reshape_backwards
-				    ? logical_sector < conf->reshape_safe
-				    : logical_sector >= conf->reshape_safe) {
-					spin_unlock_irq(&conf->device_lock);
-					schedule();
-					do_prepare = true;
-					goto retry;
-				}
-			}
-			spin_unlock_irq(&conf->device_lock);
-		}
-
-		new_sector = raid5_compute_sector(conf, logical_sector,
-						  previous,
-						  &dd_idx, NULL);
-		pr_debug("raid456: raid5_make_request, sector %llu logical %llu\n",
-			(unsigned long long)new_sector,
-			(unsigned long long)logical_sector);
-
-		sh = raid5_get_active_stripe(conf, new_sector, previous,
-				       (bi->bi_opf & REQ_RAHEAD), 0);
-		if (sh) {
-			if (unlikely(previous)) {
-				/* expansion might have moved on while waiting for a
-				 * stripe, so we must do the range check again.
-				 * Expansion could still move past after this
-				 * test, but as we are holding a reference to
-				 * 'sh', we know that if that happens,
-				 *  STRIPE_EXPANDING will get set and the expansion
-				 * won't proceed until we finish with the stripe.
-				 */
-				int must_retry = 0;
-				spin_lock_irq(&conf->device_lock);
-				if (mddev->reshape_backwards
-				    ? logical_sector >= conf->reshape_progress
-				    : logical_sector < conf->reshape_progress)
-					/* mismatch, need to try again */
-					must_retry = 1;
-				spin_unlock_irq(&conf->device_lock);
-				if (must_retry) {
-					raid5_release_stripe(sh);
-					schedule();
-					do_prepare = true;
-					goto retry;
-				}
-			}
-			if (read_seqcount_retry(&conf->gen_lock, seq)) {
-				/* Might have got the wrong stripe_head
-				 * by accident
-				 */
-				raid5_release_stripe(sh);
-				goto retry;
-			}
-
-			if (test_bit(STRIPE_EXPANDING, &sh->state) ||
-			    !add_stripe_bio(sh, bi, dd_idx, rw, previous)) {
-				/* Stripe is busy expanding or
-				 * add failed due to overlap.  Flush everything
-				 * and wait a while
-				 */
-				md_wakeup_thread(mddev->thread);
-				raid5_release_stripe(sh);
-				schedule();
-				do_prepare = true;
-				goto retry;
-			}
-			if (do_flush) {
-				set_bit(STRIPE_R5C_PREFLUSH, &sh->state);
-				/* we only need flush for one stripe */
-				do_flush = false;
-			}
-
-			set_bit(STRIPE_HANDLE, &sh->state);
-			clear_bit(STRIPE_DELAYED, &sh->state);
-			if ((!sh->batch_head || sh == sh->batch_head) &&
-			    (bi->bi_opf & REQ_SYNC) &&
-			    !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
-				atomic_inc(&conf->preread_active_stripes);
-			release_stripe_plug(mddev, sh);
-		} else {
-			/* cannot get stripe for read-ahead, just give-up */
-			bi->bi_status = BLK_STS_IOERR;
+	add_wait_queue(&conf->wait_for_overlap, &wait);
+	while (1) {
+		res = make_stripe_request(mddev, conf, &ctx, logical_sector,
+					  bi);
+		if (res == STRIPE_FAIL)
 			break;
+
+		if (res == STRIPE_RETRY)
+			continue;
+
+		if (res == STRIPE_SCHEDULE_AND_RETRY) {
+			/*
+			 * Must release the reference to batch_last before
+			 * scheduling and waiting for work to be done,
+			 * otherwise the batch_last stripe head could prevent
+			 * raid5_activate_delayed() from making progress
+			 * and thus deadlocking.
+			 */
+			if (ctx.batch_last) {
+				raid5_release_stripe(ctx.batch_last);
+				ctx.batch_last = NULL;
+			}
+
+			wait_woken(&wait, TASK_UNINTERRUPTIBLE,
+				   MAX_SCHEDULE_TIMEOUT);
+			continue;
 		}
+
+		s = find_first_bit(ctx.sectors_to_do, stripe_cnt);
+		if (s == stripe_cnt)
+			break;
+
+		logical_sector = ctx.first_sector +
+			(s << RAID5_STRIPE_SHIFT(conf));
 	}
-	finish_wait(&conf->wait_for_overlap, &w);
+	remove_wait_queue(&conf->wait_for_overlap, &wait);
+
+	if (ctx.batch_last)
+		raid5_release_stripe(ctx.batch_last);
 
 	if (rw == WRITE)
 		md_write_end(mddev);
@@ -7815,7 +8042,15 @@ static int raid5_run(struct mddev *mddev)
 		    mddev->queue->limits.discard_granularity < stripe)
 			blk_queue_max_discard_sectors(mddev->queue, 0);
 
-		blk_queue_max_hw_sectors(mddev->queue, UINT_MAX);
+		/*
+		 * Requests require having a bitmap for each stripe.
+		 * Limit the max sectors based on this.
+		 */
+		blk_queue_max_hw_sectors(mddev->queue,
+			RAID5_MAX_REQ_STRIPES << RAID5_STRIPE_SHIFT(conf));
+
+		/* No restrictions on the number of segments in the request */
+		blk_queue_max_segments(mddev->queue, USHRT_MAX);
 	}
 
 	if (log_init(conf, journal_dev, raid5_has_ppl(conf)))
@@ -8066,8 +8301,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 	 * find the disk ... but prefer rdev->saved_raid_disk
 	 * if possible.
 	 */
-	if (rdev->saved_raid_disk >= 0 &&
-	    rdev->saved_raid_disk >= first &&
+	if (rdev->saved_raid_disk >= first &&
 	    rdev->saved_raid_disk <= last &&
 	    conf->disks[rdev->saved_raid_disk].rdev == NULL)
 		first = rdev->saved_raid_disk;
@@ -8704,8 +8938,11 @@ static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf)
 			err = log_init(conf, NULL, true);
 			if (!err) {
 				err = resize_stripes(conf, conf->pool_size);
-				if (err)
+				if (err) {
+					mddev_suspend(mddev);
 					log_exit(conf);
+					mddev_resume(mddev);
+				}
 			}
 		} else
 			err = -EINVAL;
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 638d29863503..a5082bed83c8 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -812,7 +812,7 @@ extern sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
 				     struct stripe_head *sh);
 extern struct stripe_head *
 raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
-			int previous, int noblock, int noquiesce);
+			bool previous, bool noblock, bool noquiesce);
 extern int raid5_calc_degraded(struct r5conf *conf);
 extern int r5c_journal_mode_set(struct mddev *mddev, int journal_mode);
 #endif
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
index 735dc805dad7..e735e19461ba 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
@@ -50,10 +50,12 @@ static int mlx5_cmd_stub_update_root_ft(struct mlx5_flow_root_namespace *ns,
 
 static int mlx5_cmd_stub_create_flow_table(struct mlx5_flow_root_namespace *ns,
 					   struct mlx5_flow_table *ft,
-					   unsigned int size,
+					   struct mlx5_flow_table_attr *ft_attr,
 					   struct mlx5_flow_table *next_ft)
 {
-	ft->max_fte = size ? roundup_pow_of_two(size) : 1;
+	int max_fte = ft_attr->max_fte;
+
+	ft->max_fte = max_fte ? roundup_pow_of_two(max_fte) : 1;
 
 	return 0;
 }
@@ -258,7 +260,7 @@ static int mlx5_cmd_update_root_ft(struct mlx5_flow_root_namespace *ns,
 
 static int mlx5_cmd_create_flow_table(struct mlx5_flow_root_namespace *ns,
 				      struct mlx5_flow_table *ft,
-				      unsigned int size,
+				      struct mlx5_flow_table_attr *ft_attr,
 				      struct mlx5_flow_table *next_ft)
 {
 	int en_encap = !!(ft->flags & MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT);
@@ -267,17 +269,19 @@ static int mlx5_cmd_create_flow_table(struct mlx5_flow_root_namespace *ns,
 	u32 out[MLX5_ST_SZ_DW(create_flow_table_out)] = {};
 	u32 in[MLX5_ST_SZ_DW(create_flow_table_in)] = {};
 	struct mlx5_core_dev *dev = ns->dev;
+	unsigned int size;
 	int err;
 
-	if (size != POOL_NEXT_SIZE)
-		size = roundup_pow_of_two(size);
-	size = mlx5_ft_pool_get_avail_sz(dev, ft->type, size);
+	if (ft_attr->max_fte != POOL_NEXT_SIZE)
+		size = roundup_pow_of_two(ft_attr->max_fte);
+	size = mlx5_ft_pool_get_avail_sz(dev, ft->type, ft_attr->max_fte);
 	if (!size)
 		return -ENOSPC;
 
 	MLX5_SET(create_flow_table_in, in, opcode,
 		 MLX5_CMD_OP_CREATE_FLOW_TABLE);
 
+	MLX5_SET(create_flow_table_in, in, uid, ft_attr->uid);
 	MLX5_SET(create_flow_table_in, in, table_type, ft->type);
 	MLX5_SET(create_flow_table_in, in, flow_table_context.level, ft->level);
 	MLX5_SET(create_flow_table_in, in, flow_table_context.log_size, size ? ilog2(size) : 0);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h
index 274004e80f03..8ef4254b9ea1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h
@@ -38,7 +38,7 @@
 struct mlx5_flow_cmds {
 	int (*create_flow_table)(struct mlx5_flow_root_namespace *ns,
 				 struct mlx5_flow_table *ft,
-				 unsigned int size,
+				 struct mlx5_flow_table_attr *ft_attr,
 				 struct mlx5_flow_table *next_ft);
 	int (*destroy_flow_table)(struct mlx5_flow_root_namespace *ns,
 				  struct mlx5_flow_table *ft);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index f1b908d40fa6..e3960cdf5131 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -1155,7 +1155,7 @@ static struct mlx5_flow_table *__mlx5_create_flow_table(struct mlx5_flow_namespa
 			      find_next_chained_ft(fs_prio);
 	ft->def_miss_action = ns->def_miss_action;
 	ft->ns = ns;
-	err = root->cmds->create_flow_table(root, ft, ft_attr->max_fte, next_ft);
+	err = root->cmds->create_flow_table(root, ft, ft_attr, next_ft);
 	if (err)
 		goto free_ft;
 
@@ -1195,6 +1195,12 @@ struct mlx5_flow_table *mlx5_create_flow_table(struct mlx5_flow_namespace *ns,
 }
 EXPORT_SYMBOL(mlx5_create_flow_table);
 
+u32 mlx5_flow_table_id(struct mlx5_flow_table *ft)
+{
+	return ft->id;
+}
+EXPORT_SYMBOL(mlx5_flow_table_id);
+
 struct mlx5_flow_table *
 mlx5_create_vport_flow_table(struct mlx5_flow_namespace *ns,
 			     struct mlx5_flow_table_attr *ft_attr, u16 vport)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c
index 223c8741b7ae..16d65fe4f654 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c
@@ -439,6 +439,7 @@ int mlx5dr_cmd_create_flow_table(struct mlx5_core_dev *mdev,
 
 	MLX5_SET(create_flow_table_in, in, opcode, MLX5_CMD_OP_CREATE_FLOW_TABLE);
 	MLX5_SET(create_flow_table_in, in, table_type, attr->table_type);
+	MLX5_SET(create_flow_table_in, in, uid, attr->uid);
 
 	ft_mdev = MLX5_ADDR_OF(create_flow_table_in, in, flow_table_context);
 	MLX5_SET(flow_table_context, ft_mdev, termination_table, attr->term_tbl);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_table.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_table.c
index e5f6412baea9..31d443dd8386 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_table.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_table.c
@@ -214,7 +214,7 @@ static int dr_table_destroy_sw_owned_tbl(struct mlx5dr_table *tbl)
 					     tbl->table_type);
 }
 
-static int dr_table_create_sw_owned_tbl(struct mlx5dr_table *tbl)
+static int dr_table_create_sw_owned_tbl(struct mlx5dr_table *tbl, u16 uid)
 {
 	bool en_encap = !!(tbl->flags & MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT);
 	bool en_decap = !!(tbl->flags & MLX5_FLOW_TABLE_TUNNEL_EN_DECAP);
@@ -236,6 +236,7 @@ static int dr_table_create_sw_owned_tbl(struct mlx5dr_table *tbl)
 	ft_attr.sw_owner = true;
 	ft_attr.decap_en = en_decap;
 	ft_attr.reformat_en = en_encap;
+	ft_attr.uid = uid;
 
 	ret = mlx5dr_cmd_create_flow_table(tbl->dmn->mdev, &ft_attr,
 					   NULL, &tbl->table_id);
@@ -243,7 +244,8 @@ static int dr_table_create_sw_owned_tbl(struct mlx5dr_table *tbl)
 	return ret;
 }
 
-struct mlx5dr_table *mlx5dr_table_create(struct mlx5dr_domain *dmn, u32 level, u32 flags)
+struct mlx5dr_table *mlx5dr_table_create(struct mlx5dr_domain *dmn, u32 level,
+					 u32 flags, u16 uid)
 {
 	struct mlx5dr_table *tbl;
 	int ret;
@@ -263,7 +265,7 @@ struct mlx5dr_table *mlx5dr_table_create(struct mlx5dr_domain *dmn, u32 level, u
 	if (ret)
 		goto free_tbl;
 
-	ret = dr_table_create_sw_owned_tbl(tbl);
+	ret = dr_table_create_sw_owned_tbl(tbl, uid);
 	if (ret)
 		goto uninit_tbl;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h
index feed227944b7..062c7c74a1f3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h
@@ -1217,6 +1217,7 @@ struct mlx5dr_cmd_query_flow_table_details {
 
 struct mlx5dr_cmd_create_flow_table_attr {
 	u32 table_type;
+	u16 uid;
 	u64 icm_addr_rx;
 	u64 icm_addr_tx;
 	u8 level;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c
index 75672663359d..13b6d4721e17 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c
@@ -62,7 +62,7 @@ static int set_miss_action(struct mlx5_flow_root_namespace *ns,
 
 static int mlx5_cmd_dr_create_flow_table(struct mlx5_flow_root_namespace *ns,
 					 struct mlx5_flow_table *ft,
-					 unsigned int size,
+					 struct mlx5_flow_table_attr *ft_attr,
 					 struct mlx5_flow_table *next_ft)
 {
 	struct mlx5dr_table *tbl;
@@ -71,7 +71,7 @@ static int mlx5_cmd_dr_create_flow_table(struct mlx5_flow_root_namespace *ns,
 
 	if (mlx5_dr_is_fw_table(ft->flags))
 		return mlx5_fs_cmd_get_fw_cmds()->create_flow_table(ns, ft,
-								    size,
+								    ft_attr,
 								    next_ft);
 	flags = ft->flags;
 	/* turn off encap/decap if not supported for sw-str by fw */
@@ -79,7 +79,8 @@ static int mlx5_cmd_dr_create_flow_table(struct mlx5_flow_root_namespace *ns,
 		flags = ft->flags & ~(MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT |
 				      MLX5_FLOW_TABLE_TUNNEL_EN_DECAP);
 
-	tbl = mlx5dr_table_create(ns->fs_dr_domain.dr_domain, ft->level, flags);
+	tbl = mlx5dr_table_create(ns->fs_dr_domain.dr_domain, ft->level, flags,
+				  ft_attr->uid);
 	if (!tbl) {
 		mlx5_core_err(ns->dev, "Failed creating dr flow_table\n");
 		return -EINVAL;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h
index ecffc2cbe6fe..226a0d7bb06d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h
@@ -51,7 +51,8 @@ void mlx5dr_domain_set_peer(struct mlx5dr_domain *dmn,
 			    struct mlx5dr_domain *peer_dmn);
 
 struct mlx5dr_table *
-mlx5dr_table_create(struct mlx5dr_domain *domain, u32 level, u32 flags);
+mlx5dr_table_create(struct mlx5dr_domain *domain, u32 level, u32 flags,
+		    u16 uid);
 
 struct mlx5dr_table *
 mlx5dr_table_get_from_fs_ft(struct mlx5_flow_table *ft);
diff --git a/drivers/nvme/Kconfig b/drivers/nvme/Kconfig
index 87ae409a32b9..656e46d938da 100644
--- a/drivers/nvme/Kconfig
+++ b/drivers/nvme/Kconfig
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0-only
 menu "NVME Support"
 
+source "drivers/nvme/common/Kconfig"
 source "drivers/nvme/host/Kconfig"
 source "drivers/nvme/target/Kconfig"
 
diff --git a/drivers/nvme/Makefile b/drivers/nvme/Makefile
index fb42c44609a8..eedca8c72098 100644
--- a/drivers/nvme/Makefile
+++ b/drivers/nvme/Makefile
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0-only
 
+obj-$(CONFIG_NVME_COMMON)		+= common/
 obj-y		+= host/
 obj-y		+= target/
diff --git a/drivers/nvme/common/Kconfig b/drivers/nvme/common/Kconfig
new file mode 100644
index 000000000000..4514f44362dd
--- /dev/null
+++ b/drivers/nvme/common/Kconfig
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+config NVME_COMMON
+       tristate
diff --git a/drivers/nvme/common/Makefile b/drivers/nvme/common/Makefile
new file mode 100644
index 000000000000..720c625b8a52
--- /dev/null
+++ b/drivers/nvme/common/Makefile
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0
+
+ccflags-y			+= -I$(src)
+
+obj-$(CONFIG_NVME_COMMON)	+= nvme-common.o
+
+nvme-common-y			+= auth.o
diff --git a/drivers/nvme/common/auth.c b/drivers/nvme/common/auth.c
new file mode 100644
index 000000000000..04bd28f17dcc
--- /dev/null
+++ b/drivers/nvme/common/auth.c
@@ -0,0 +1,483 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2020 Hannes Reinecke, SUSE Linux
+ */
+
+#include <linux/module.h>
+#include <linux/crc32.h>
+#include <linux/base64.h>
+#include <linux/prandom.h>
+#include <linux/scatterlist.h>
+#include <asm/unaligned.h>
+#include <crypto/hash.h>
+#include <crypto/dh.h>
+#include <linux/nvme.h>
+#include <linux/nvme-auth.h>
+
+static u32 nvme_dhchap_seqnum;
+static DEFINE_MUTEX(nvme_dhchap_mutex);
+
+u32 nvme_auth_get_seqnum(void)
+{
+	u32 seqnum;
+
+	mutex_lock(&nvme_dhchap_mutex);
+	if (!nvme_dhchap_seqnum)
+		nvme_dhchap_seqnum = prandom_u32();
+	else {
+		nvme_dhchap_seqnum++;
+		if (!nvme_dhchap_seqnum)
+			nvme_dhchap_seqnum++;
+	}
+	seqnum = nvme_dhchap_seqnum;
+	mutex_unlock(&nvme_dhchap_mutex);
+	return seqnum;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_get_seqnum);
+
+static struct nvme_auth_dhgroup_map {
+	const char name[16];
+	const char kpp[16];
+} dhgroup_map[] = {
+	[NVME_AUTH_DHGROUP_NULL] = {
+		.name = "null", .kpp = "null" },
+	[NVME_AUTH_DHGROUP_2048] = {
+		.name = "ffdhe2048", .kpp = "ffdhe2048(dh)" },
+	[NVME_AUTH_DHGROUP_3072] = {
+		.name = "ffdhe3072", .kpp = "ffdhe3072(dh)" },
+	[NVME_AUTH_DHGROUP_4096] = {
+		.name = "ffdhe4096", .kpp = "ffdhe4096(dh)" },
+	[NVME_AUTH_DHGROUP_6144] = {
+		.name = "ffdhe6144", .kpp = "ffdhe6144(dh)" },
+	[NVME_AUTH_DHGROUP_8192] = {
+		.name = "ffdhe8192", .kpp = "ffdhe8192(dh)" },
+};
+
+const char *nvme_auth_dhgroup_name(u8 dhgroup_id)
+{
+	if (dhgroup_id >= ARRAY_SIZE(dhgroup_map))
+		return NULL;
+	return dhgroup_map[dhgroup_id].name;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_dhgroup_name);
+
+const char *nvme_auth_dhgroup_kpp(u8 dhgroup_id)
+{
+	if (dhgroup_id >= ARRAY_SIZE(dhgroup_map))
+		return NULL;
+	return dhgroup_map[dhgroup_id].kpp;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_dhgroup_kpp);
+
+u8 nvme_auth_dhgroup_id(const char *dhgroup_name)
+{
+	int i;
+
+	if (!dhgroup_name || !strlen(dhgroup_name))
+		return NVME_AUTH_DHGROUP_INVALID;
+	for (i = 0; i < ARRAY_SIZE(dhgroup_map); i++) {
+		if (!strlen(dhgroup_map[i].name))
+			continue;
+		if (!strncmp(dhgroup_map[i].name, dhgroup_name,
+			     strlen(dhgroup_map[i].name)))
+			return i;
+	}
+	return NVME_AUTH_DHGROUP_INVALID;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_dhgroup_id);
+
+static struct nvme_dhchap_hash_map {
+	int len;
+	const char hmac[15];
+	const char digest[8];
+} hash_map[] = {
+	[NVME_AUTH_HASH_SHA256] = {
+		.len = 32,
+		.hmac = "hmac(sha256)",
+		.digest = "sha256",
+	},
+	[NVME_AUTH_HASH_SHA384] = {
+		.len = 48,
+		.hmac = "hmac(sha384)",
+		.digest = "sha384",
+	},
+	[NVME_AUTH_HASH_SHA512] = {
+		.len = 64,
+		.hmac = "hmac(sha512)",
+		.digest = "sha512",
+	},
+};
+
+const char *nvme_auth_hmac_name(u8 hmac_id)
+{
+	if (hmac_id >= ARRAY_SIZE(hash_map))
+		return NULL;
+	return hash_map[hmac_id].hmac;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_hmac_name);
+
+const char *nvme_auth_digest_name(u8 hmac_id)
+{
+	if (hmac_id >= ARRAY_SIZE(hash_map))
+		return NULL;
+	return hash_map[hmac_id].digest;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_digest_name);
+
+u8 nvme_auth_hmac_id(const char *hmac_name)
+{
+	int i;
+
+	if (!hmac_name || !strlen(hmac_name))
+		return NVME_AUTH_HASH_INVALID;
+
+	for (i = 0; i < ARRAY_SIZE(hash_map); i++) {
+		if (!strlen(hash_map[i].hmac))
+			continue;
+		if (!strncmp(hash_map[i].hmac, hmac_name,
+			     strlen(hash_map[i].hmac)))
+			return i;
+	}
+	return NVME_AUTH_HASH_INVALID;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_hmac_id);
+
+size_t nvme_auth_hmac_hash_len(u8 hmac_id)
+{
+	if (hmac_id >= ARRAY_SIZE(hash_map))
+		return 0;
+	return hash_map[hmac_id].len;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_hmac_hash_len);
+
+struct nvme_dhchap_key *nvme_auth_extract_key(unsigned char *secret,
+					      u8 key_hash)
+{
+	struct nvme_dhchap_key *key;
+	unsigned char *p;
+	u32 crc;
+	int ret, key_len;
+	size_t allocated_len = strlen(secret);
+
+	/* Secret might be affixed with a ':' */
+	p = strrchr(secret, ':');
+	if (p)
+		allocated_len = p - secret;
+	key = kzalloc(sizeof(*key), GFP_KERNEL);
+	if (!key)
+		return ERR_PTR(-ENOMEM);
+	key->key = kzalloc(allocated_len, GFP_KERNEL);
+	if (!key->key) {
+		ret = -ENOMEM;
+		goto out_free_key;
+	}
+
+	key_len = base64_decode(secret, allocated_len, key->key);
+	if (key_len < 0) {
+		pr_debug("base64 key decoding error %d\n",
+			 key_len);
+		ret = key_len;
+		goto out_free_secret;
+	}
+
+	if (key_len != 36 && key_len != 52 &&
+	    key_len != 68) {
+		pr_err("Invalid key len %d\n", key_len);
+		ret = -EINVAL;
+		goto out_free_secret;
+	}
+
+	if (key_hash > 0 &&
+	    (key_len - 4) != nvme_auth_hmac_hash_len(key_hash)) {
+		pr_err("Mismatched key len %d for %s\n", key_len,
+		       nvme_auth_hmac_name(key_hash));
+		ret = -EINVAL;
+		goto out_free_secret;
+	}
+
+	/* The last four bytes is the CRC in little-endian format */
+	key_len -= 4;
+	/*
+	 * The linux implementation doesn't do pre- and post-increments,
+	 * so we have to do it manually.
+	 */
+	crc = ~crc32(~0, key->key, key_len);
+
+	if (get_unaligned_le32(key->key + key_len) != crc) {
+		pr_err("key crc mismatch (key %08x, crc %08x)\n",
+		       get_unaligned_le32(key->key + key_len), crc);
+		ret = -EKEYREJECTED;
+		goto out_free_secret;
+	}
+	key->len = key_len;
+	key->hash = key_hash;
+	return key;
+out_free_secret:
+	kfree_sensitive(key->key);
+out_free_key:
+	kfree(key);
+	return ERR_PTR(ret);
+}
+EXPORT_SYMBOL_GPL(nvme_auth_extract_key);
+
+void nvme_auth_free_key(struct nvme_dhchap_key *key)
+{
+	if (!key)
+		return;
+	kfree_sensitive(key->key);
+	kfree(key);
+}
+EXPORT_SYMBOL_GPL(nvme_auth_free_key);
+
+u8 *nvme_auth_transform_key(struct nvme_dhchap_key *key, char *nqn)
+{
+	const char *hmac_name;
+	struct crypto_shash *key_tfm;
+	struct shash_desc *shash;
+	u8 *transformed_key;
+	int ret;
+
+	if (!key || !key->key) {
+		pr_warn("No key specified\n");
+		return ERR_PTR(-ENOKEY);
+	}
+	if (key->hash == 0) {
+		transformed_key = kmemdup(key->key, key->len, GFP_KERNEL);
+		return transformed_key ? transformed_key : ERR_PTR(-ENOMEM);
+	}
+	hmac_name = nvme_auth_hmac_name(key->hash);
+	if (!hmac_name) {
+		pr_warn("Invalid key hash id %d\n", key->hash);
+		return ERR_PTR(-EINVAL);
+	}
+
+	key_tfm = crypto_alloc_shash(hmac_name, 0, 0);
+	if (IS_ERR(key_tfm))
+		return (u8 *)key_tfm;
+
+	shash = kmalloc(sizeof(struct shash_desc) +
+			crypto_shash_descsize(key_tfm),
+			GFP_KERNEL);
+	if (!shash) {
+		ret = -ENOMEM;
+		goto out_free_key;
+	}
+
+	transformed_key = kzalloc(crypto_shash_digestsize(key_tfm), GFP_KERNEL);
+	if (!transformed_key) {
+		ret = -ENOMEM;
+		goto out_free_shash;
+	}
+
+	shash->tfm = key_tfm;
+	ret = crypto_shash_setkey(key_tfm, key->key, key->len);
+	if (ret < 0)
+		goto out_free_transformed_key;
+	ret = crypto_shash_init(shash);
+	if (ret < 0)
+		goto out_free_transformed_key;
+	ret = crypto_shash_update(shash, nqn, strlen(nqn));
+	if (ret < 0)
+		goto out_free_transformed_key;
+	ret = crypto_shash_update(shash, "NVMe-over-Fabrics", 17);
+	if (ret < 0)
+		goto out_free_transformed_key;
+	ret = crypto_shash_final(shash, transformed_key);
+	if (ret < 0)
+		goto out_free_transformed_key;
+
+	kfree(shash);
+	crypto_free_shash(key_tfm);
+
+	return transformed_key;
+
+out_free_transformed_key:
+	kfree_sensitive(transformed_key);
+out_free_shash:
+	kfree(shash);
+out_free_key:
+	crypto_free_shash(key_tfm);
+
+	return ERR_PTR(ret);
+}
+EXPORT_SYMBOL_GPL(nvme_auth_transform_key);
+
+static int nvme_auth_hash_skey(int hmac_id, u8 *skey, size_t skey_len, u8 *hkey)
+{
+	const char *digest_name;
+	struct crypto_shash *tfm;
+	int ret;
+
+	digest_name = nvme_auth_digest_name(hmac_id);
+	if (!digest_name) {
+		pr_debug("%s: failed to get digest for %d\n", __func__,
+			 hmac_id);
+		return -EINVAL;
+	}
+	tfm = crypto_alloc_shash(digest_name, 0, 0);
+	if (IS_ERR(tfm))
+		return -ENOMEM;
+
+	ret = crypto_shash_tfm_digest(tfm, skey, skey_len, hkey);
+	if (ret < 0)
+		pr_debug("%s: Failed to hash digest len %zu\n", __func__,
+			 skey_len);
+
+	crypto_free_shash(tfm);
+	return ret;
+}
+
+int nvme_auth_augmented_challenge(u8 hmac_id, u8 *skey, size_t skey_len,
+		u8 *challenge, u8 *aug, size_t hlen)
+{
+	struct crypto_shash *tfm;
+	struct shash_desc *desc;
+	u8 *hashed_key;
+	const char *hmac_name;
+	int ret;
+
+	hashed_key = kmalloc(hlen, GFP_KERNEL);
+	if (!hashed_key)
+		return -ENOMEM;
+
+	ret = nvme_auth_hash_skey(hmac_id, skey,
+				  skey_len, hashed_key);
+	if (ret < 0)
+		goto out_free_key;
+
+	hmac_name = nvme_auth_hmac_name(hmac_id);
+	if (!hmac_name) {
+		pr_warn("%s: invalid hash algorithm %d\n",
+			__func__, hmac_id);
+		ret = -EINVAL;
+		goto out_free_key;
+	}
+
+	tfm = crypto_alloc_shash(hmac_name, 0, 0);
+	if (IS_ERR(tfm)) {
+		ret = PTR_ERR(tfm);
+		goto out_free_key;
+	}
+
+	desc = kmalloc(sizeof(struct shash_desc) + crypto_shash_descsize(tfm),
+		       GFP_KERNEL);
+	if (!desc) {
+		ret = -ENOMEM;
+		goto out_free_hash;
+	}
+	desc->tfm = tfm;
+
+	ret = crypto_shash_setkey(tfm, hashed_key, hlen);
+	if (ret)
+		goto out_free_desc;
+
+	ret = crypto_shash_init(desc);
+	if (ret)
+		goto out_free_desc;
+
+	ret = crypto_shash_update(desc, challenge, hlen);
+	if (ret)
+		goto out_free_desc;
+
+	ret = crypto_shash_final(desc, aug);
+out_free_desc:
+	kfree_sensitive(desc);
+out_free_hash:
+	crypto_free_shash(tfm);
+out_free_key:
+	kfree_sensitive(hashed_key);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_augmented_challenge);
+
+int nvme_auth_gen_privkey(struct crypto_kpp *dh_tfm, u8 dh_gid)
+{
+	int ret;
+
+	ret = crypto_kpp_set_secret(dh_tfm, NULL, 0);
+	if (ret)
+		pr_debug("failed to set private key, error %d\n", ret);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_gen_privkey);
+
+int nvme_auth_gen_pubkey(struct crypto_kpp *dh_tfm,
+		u8 *host_key, size_t host_key_len)
+{
+	struct kpp_request *req;
+	struct crypto_wait wait;
+	struct scatterlist dst;
+	int ret;
+
+	req = kpp_request_alloc(dh_tfm, GFP_KERNEL);
+	if (!req)
+		return -ENOMEM;
+
+	crypto_init_wait(&wait);
+	kpp_request_set_input(req, NULL, 0);
+	sg_init_one(&dst, host_key, host_key_len);
+	kpp_request_set_output(req, &dst, host_key_len);
+	kpp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
+				 crypto_req_done, &wait);
+
+	ret = crypto_wait_req(crypto_kpp_generate_public_key(req), &wait);
+	kpp_request_free(req);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_gen_pubkey);
+
+int nvme_auth_gen_shared_secret(struct crypto_kpp *dh_tfm,
+		u8 *ctrl_key, size_t ctrl_key_len,
+		u8 *sess_key, size_t sess_key_len)
+{
+	struct kpp_request *req;
+	struct crypto_wait wait;
+	struct scatterlist src, dst;
+	int ret;
+
+	req = kpp_request_alloc(dh_tfm, GFP_KERNEL);
+	if (!req)
+		return -ENOMEM;
+
+	crypto_init_wait(&wait);
+	sg_init_one(&src, ctrl_key, ctrl_key_len);
+	kpp_request_set_input(req, &src, ctrl_key_len);
+	sg_init_one(&dst, sess_key, sess_key_len);
+	kpp_request_set_output(req, &dst, sess_key_len);
+	kpp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
+				 crypto_req_done, &wait);
+
+	ret = crypto_wait_req(crypto_kpp_compute_shared_secret(req), &wait);
+
+	kpp_request_free(req);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_gen_shared_secret);
+
+int nvme_auth_generate_key(u8 *secret, struct nvme_dhchap_key **ret_key)
+{
+	struct nvme_dhchap_key *key;
+	u8 key_hash;
+
+	if (!secret) {
+		*ret_key = NULL;
+		return 0;
+	}
+
+	if (sscanf(secret, "DHHC-1:%hhd:%*s:", &key_hash) != 1)
+		return -EINVAL;
+
+	/* Pass in the secret without the 'DHHC-1:XX:' prefix */
+	key = nvme_auth_extract_key(secret + 10, key_hash);
+	if (IS_ERR(key)) {
+		*ret_key = NULL;
+		return PTR_ERR(key);
+	}
+
+	*ret_key = key;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_generate_key);
+
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig
index 877d2ec4ea9f..2f6a7f8c94e8 100644
--- a/drivers/nvme/host/Kconfig
+++ b/drivers/nvme/host/Kconfig
@@ -92,6 +92,21 @@ config NVME_TCP
 
 	  If unsure, say N.
 
+config NVME_AUTH
+	bool "NVM Express over Fabrics In-Band Authentication"
+	depends on NVME_CORE
+	select NVME_COMMON
+	select CRYPTO
+	select CRYPTO_HMAC
+	select CRYPTO_SHA256
+	select CRYPTO_SHA512
+	select CRYPTO_DH
+	select CRYPTO_DH_RFC7919_GROUPS
+	help
+	  This provides support for NVMe over Fabrics In-Band Authentication.
+
+	  If unsure, say N.
+
 config NVME_APPLE
 	tristate "Apple ANS2 NVM Express host driver"
 	depends on OF && BLOCK
diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile
index a36ae1612059..e27202d22c7d 100644
--- a/drivers/nvme/host/Makefile
+++ b/drivers/nvme/host/Makefile
@@ -10,12 +10,14 @@ obj-$(CONFIG_NVME_FC)			+= nvme-fc.o
 obj-$(CONFIG_NVME_TCP)			+= nvme-tcp.o
 obj-$(CONFIG_NVME_APPLE)		+= nvme-apple.o
 
-nvme-core-y				:= core.o ioctl.o constants.o
+nvme-core-y				+= core.o ioctl.o
+nvme-core-$(CONFIG_NVME_VERBOSE_ERRORS)	+= constants.o
 nvme-core-$(CONFIG_TRACING)		+= trace.o
 nvme-core-$(CONFIG_NVME_MULTIPATH)	+= multipath.o
 nvme-core-$(CONFIG_BLK_DEV_ZONED)	+= zns.o
 nvme-core-$(CONFIG_FAULT_INJECTION_DEBUG_FS)	+= fault_inject.o
 nvme-core-$(CONFIG_NVME_HWMON)		+= hwmon.o
+nvme-core-$(CONFIG_NVME_AUTH)		+= auth.o
 
 nvme-y					+= pci.o
 
diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c
index 5c352d5d8ee6..5fc5ea196b40 100644
--- a/drivers/nvme/host/apple.c
+++ b/drivers/nvme/host/apple.c
@@ -845,11 +845,8 @@ static void apple_nvme_disable(struct apple_nvme *anv, bool shutdown)
 	apple_nvme_handle_cq(&anv->adminq, true);
 	spin_unlock_irqrestore(&anv->lock, flags);
 
-	blk_mq_tagset_busy_iter(&anv->tagset, nvme_cancel_request, &anv->ctrl);
-	blk_mq_tagset_busy_iter(&anv->admin_tagset, nvme_cancel_request,
-				&anv->ctrl);
-	blk_mq_tagset_wait_completed_request(&anv->tagset);
-	blk_mq_tagset_wait_completed_request(&anv->admin_tagset);
+	nvme_cancel_tagset(&anv->ctrl);
+	nvme_cancel_admin_tagset(&anv->ctrl);
 
 	/*
 	 * The driver will not be starting up queues again if shutting down so
@@ -1222,6 +1219,11 @@ static void apple_nvme_async_probe(void *data, async_cookie_t cookie)
 	nvme_put_ctrl(&anv->ctrl);
 }
 
+static void devm_apple_nvme_put_tag_set(void *data)
+{
+	blk_mq_free_tag_set(data);
+}
+
 static int apple_nvme_alloc_tagsets(struct apple_nvme *anv)
 {
 	int ret;
@@ -1238,8 +1240,7 @@ static int apple_nvme_alloc_tagsets(struct apple_nvme *anv)
 	ret = blk_mq_alloc_tag_set(&anv->admin_tagset);
 	if (ret)
 		return ret;
-	ret = devm_add_action_or_reset(anv->dev,
-				       (void (*)(void *))blk_mq_free_tag_set,
+	ret = devm_add_action_or_reset(anv->dev, devm_apple_nvme_put_tag_set,
 				       &anv->admin_tagset);
 	if (ret)
 		return ret;
@@ -1263,8 +1264,8 @@ static int apple_nvme_alloc_tagsets(struct apple_nvme *anv)
 	ret = blk_mq_alloc_tag_set(&anv->tagset);
 	if (ret)
 		return ret;
-	ret = devm_add_action_or_reset(
-		anv->dev, (void (*)(void *))blk_mq_free_tag_set, &anv->tagset);
+	ret = devm_add_action_or_reset(anv->dev, devm_apple_nvme_put_tag_set,
+					&anv->tagset);
 	if (ret)
 		return ret;
 
@@ -1365,6 +1366,11 @@ static int apple_nvme_attach_genpd(struct apple_nvme *anv)
 	return 0;
 }
 
+static void devm_apple_nvme_mempool_destroy(void *data)
+{
+	mempool_destroy(data);
+}
+
 static int apple_nvme_probe(struct platform_device *pdev)
 {
 	struct device *dev = &pdev->dev;
@@ -1462,8 +1468,8 @@ static int apple_nvme_probe(struct platform_device *pdev)
 		ret = -ENOMEM;
 		goto put_dev;
 	}
-	ret = devm_add_action_or_reset(
-		anv->dev, (void (*)(void *))mempool_destroy, anv->iod_mempool);
+	ret = devm_add_action_or_reset(anv->dev,
+			devm_apple_nvme_mempool_destroy, anv->iod_mempool);
 	if (ret)
 		goto put_dev;
 
diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c
new file mode 100644
index 000000000000..c8a6db7c4498
--- /dev/null
+++ b/drivers/nvme/host/auth.c
@@ -0,0 +1,1017 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2020 Hannes Reinecke, SUSE Linux
+ */
+
+#include <linux/crc32.h>
+#include <linux/base64.h>
+#include <linux/prandom.h>
+#include <asm/unaligned.h>
+#include <crypto/hash.h>
+#include <crypto/dh.h>
+#include "nvme.h"
+#include "fabrics.h"
+#include <linux/nvme-auth.h>
+
+struct nvme_dhchap_queue_context {
+	struct list_head entry;
+	struct work_struct auth_work;
+	struct nvme_ctrl *ctrl;
+	struct crypto_shash *shash_tfm;
+	struct crypto_kpp *dh_tfm;
+	void *buf;
+	size_t buf_size;
+	int qid;
+	int error;
+	u32 s1;
+	u32 s2;
+	u16 transaction;
+	u8 status;
+	u8 hash_id;
+	size_t hash_len;
+	u8 dhgroup_id;
+	u8 c1[64];
+	u8 c2[64];
+	u8 response[64];
+	u8 *host_response;
+	u8 *ctrl_key;
+	int ctrl_key_len;
+	u8 *host_key;
+	int host_key_len;
+	u8 *sess_key;
+	int sess_key_len;
+};
+
+#define nvme_auth_flags_from_qid(qid) \
+	(qid == 0) ? 0 : BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_RESERVED
+#define nvme_auth_queue_from_qid(ctrl, qid) \
+	(qid == 0) ? (ctrl)->fabrics_q : (ctrl)->connect_q
+
+static int nvme_auth_submit(struct nvme_ctrl *ctrl, int qid,
+			    void *data, size_t data_len, bool auth_send)
+{
+	struct nvme_command cmd = {};
+	blk_mq_req_flags_t flags = nvme_auth_flags_from_qid(qid);
+	struct request_queue *q = nvme_auth_queue_from_qid(ctrl, qid);
+	int ret;
+
+	cmd.auth_common.opcode = nvme_fabrics_command;
+	cmd.auth_common.secp = NVME_AUTH_DHCHAP_PROTOCOL_IDENTIFIER;
+	cmd.auth_common.spsp0 = 0x01;
+	cmd.auth_common.spsp1 = 0x01;
+	if (auth_send) {
+		cmd.auth_send.fctype = nvme_fabrics_type_auth_send;
+		cmd.auth_send.tl = cpu_to_le32(data_len);
+	} else {
+		cmd.auth_receive.fctype = nvme_fabrics_type_auth_receive;
+		cmd.auth_receive.al = cpu_to_le32(data_len);
+	}
+
+	ret = __nvme_submit_sync_cmd(q, &cmd, NULL, data, data_len,
+				     qid == 0 ? NVME_QID_ANY : qid,
+				     0, flags);
+	if (ret > 0)
+		dev_warn(ctrl->device,
+			"qid %d auth_send failed with status %d\n", qid, ret);
+	else if (ret < 0)
+		dev_err(ctrl->device,
+			"qid %d auth_send failed with error %d\n", qid, ret);
+	return ret;
+}
+
+static int nvme_auth_receive_validate(struct nvme_ctrl *ctrl, int qid,
+		struct nvmf_auth_dhchap_failure_data *data,
+		u16 transaction, u8 expected_msg)
+{
+	dev_dbg(ctrl->device, "%s: qid %d auth_type %d auth_id %x\n",
+		__func__, qid, data->auth_type, data->auth_id);
+
+	if (data->auth_type == NVME_AUTH_COMMON_MESSAGES &&
+	    data->auth_id == NVME_AUTH_DHCHAP_MESSAGE_FAILURE1) {
+		return data->rescode_exp;
+	}
+	if (data->auth_type != NVME_AUTH_DHCHAP_MESSAGES ||
+	    data->auth_id != expected_msg) {
+		dev_warn(ctrl->device,
+			 "qid %d invalid message %02x/%02x\n",
+			 qid, data->auth_type, data->auth_id);
+		return NVME_AUTH_DHCHAP_FAILURE_INCORRECT_MESSAGE;
+	}
+	if (le16_to_cpu(data->t_id) != transaction) {
+		dev_warn(ctrl->device,
+			 "qid %d invalid transaction ID %d\n",
+			 qid, le16_to_cpu(data->t_id));
+		return NVME_AUTH_DHCHAP_FAILURE_INCORRECT_MESSAGE;
+	}
+	return 0;
+}
+
+static int nvme_auth_set_dhchap_negotiate_data(struct nvme_ctrl *ctrl,
+		struct nvme_dhchap_queue_context *chap)
+{
+	struct nvmf_auth_dhchap_negotiate_data *data = chap->buf;
+	size_t size = sizeof(*data) + sizeof(union nvmf_auth_protocol);
+
+	if (chap->buf_size < size) {
+		chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
+		return -EINVAL;
+	}
+	memset((u8 *)chap->buf, 0, size);
+	data->auth_type = NVME_AUTH_COMMON_MESSAGES;
+	data->auth_id = NVME_AUTH_DHCHAP_MESSAGE_NEGOTIATE;
+	data->t_id = cpu_to_le16(chap->transaction);
+	data->sc_c = 0; /* No secure channel concatenation */
+	data->napd = 1;
+	data->auth_protocol[0].dhchap.authid = NVME_AUTH_DHCHAP_AUTH_ID;
+	data->auth_protocol[0].dhchap.halen = 3;
+	data->auth_protocol[0].dhchap.dhlen = 6;
+	data->auth_protocol[0].dhchap.idlist[0] = NVME_AUTH_HASH_SHA256;
+	data->auth_protocol[0].dhchap.idlist[1] = NVME_AUTH_HASH_SHA384;
+	data->auth_protocol[0].dhchap.idlist[2] = NVME_AUTH_HASH_SHA512;
+	data->auth_protocol[0].dhchap.idlist[30] = NVME_AUTH_DHGROUP_NULL;
+	data->auth_protocol[0].dhchap.idlist[31] = NVME_AUTH_DHGROUP_2048;
+	data->auth_protocol[0].dhchap.idlist[32] = NVME_AUTH_DHGROUP_3072;
+	data->auth_protocol[0].dhchap.idlist[33] = NVME_AUTH_DHGROUP_4096;
+	data->auth_protocol[0].dhchap.idlist[34] = NVME_AUTH_DHGROUP_6144;
+	data->auth_protocol[0].dhchap.idlist[35] = NVME_AUTH_DHGROUP_8192;
+
+	return size;
+}
+
+static int nvme_auth_process_dhchap_challenge(struct nvme_ctrl *ctrl,
+		struct nvme_dhchap_queue_context *chap)
+{
+	struct nvmf_auth_dhchap_challenge_data *data = chap->buf;
+	u16 dhvlen = le16_to_cpu(data->dhvlen);
+	size_t size = sizeof(*data) + data->hl + dhvlen;
+	const char *gid_name = nvme_auth_dhgroup_name(data->dhgid);
+	const char *hmac_name, *kpp_name;
+
+	if (chap->buf_size < size) {
+		chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
+		return NVME_SC_INVALID_FIELD;
+	}
+
+	hmac_name = nvme_auth_hmac_name(data->hashid);
+	if (!hmac_name) {
+		dev_warn(ctrl->device,
+			 "qid %d: invalid HASH ID %d\n",
+			 chap->qid, data->hashid);
+		chap->status = NVME_AUTH_DHCHAP_FAILURE_HASH_UNUSABLE;
+		return NVME_SC_INVALID_FIELD;
+	}
+
+	if (chap->hash_id == data->hashid && chap->shash_tfm &&
+	    !strcmp(crypto_shash_alg_name(chap->shash_tfm), hmac_name) &&
+	    crypto_shash_digestsize(chap->shash_tfm) == data->hl) {
+		dev_dbg(ctrl->device,
+			"qid %d: reuse existing hash %s\n",
+			chap->qid, hmac_name);
+		goto select_kpp;
+	}
+
+	/* Reset if hash cannot be reused */
+	if (chap->shash_tfm) {
+		crypto_free_shash(chap->shash_tfm);
+		chap->hash_id = 0;
+		chap->hash_len = 0;
+	}
+	chap->shash_tfm = crypto_alloc_shash(hmac_name, 0,
+					     CRYPTO_ALG_ALLOCATES_MEMORY);
+	if (IS_ERR(chap->shash_tfm)) {
+		dev_warn(ctrl->device,
+			 "qid %d: failed to allocate hash %s, error %ld\n",
+			 chap->qid, hmac_name, PTR_ERR(chap->shash_tfm));
+		chap->shash_tfm = NULL;
+		chap->status = NVME_AUTH_DHCHAP_FAILURE_FAILED;
+		return NVME_SC_AUTH_REQUIRED;
+	}
+
+	if (crypto_shash_digestsize(chap->shash_tfm) != data->hl) {
+		dev_warn(ctrl->device,
+			 "qid %d: invalid hash length %d\n",
+			 chap->qid, data->hl);
+		crypto_free_shash(chap->shash_tfm);
+		chap->shash_tfm = NULL;
+		chap->status = NVME_AUTH_DHCHAP_FAILURE_HASH_UNUSABLE;
+		return NVME_SC_AUTH_REQUIRED;
+	}
+
+	/* Reset host response if the hash had been changed */
+	if (chap->hash_id != data->hashid) {
+		kfree(chap->host_response);
+		chap->host_response = NULL;
+	}
+
+	chap->hash_id = data->hashid;
+	chap->hash_len = data->hl;
+	dev_dbg(ctrl->device, "qid %d: selected hash %s\n",
+		chap->qid, hmac_name);
+
+select_kpp:
+	kpp_name = nvme_auth_dhgroup_kpp(data->dhgid);
+	if (!kpp_name) {
+		dev_warn(ctrl->device,
+			 "qid %d: invalid DH group id %d\n",
+			 chap->qid, data->dhgid);
+		chap->status = NVME_AUTH_DHCHAP_FAILURE_DHGROUP_UNUSABLE;
+		/* Leave previous dh_tfm intact */
+		return NVME_SC_AUTH_REQUIRED;
+	}
+
+	/* Clear host and controller key to avoid accidental reuse */
+	kfree_sensitive(chap->host_key);
+	chap->host_key = NULL;
+	chap->host_key_len = 0;
+	kfree_sensitive(chap->ctrl_key);
+	chap->ctrl_key = NULL;
+	chap->ctrl_key_len = 0;
+
+	if (chap->dhgroup_id == data->dhgid &&
+	    (data->dhgid == NVME_AUTH_DHGROUP_NULL || chap->dh_tfm)) {
+		dev_dbg(ctrl->device,
+			"qid %d: reuse existing DH group %s\n",
+			chap->qid, gid_name);
+		goto skip_kpp;
+	}
+
+	/* Reset dh_tfm if it can't be reused */
+	if (chap->dh_tfm) {
+		crypto_free_kpp(chap->dh_tfm);
+		chap->dh_tfm = NULL;
+	}
+
+	if (data->dhgid != NVME_AUTH_DHGROUP_NULL) {
+		if (dhvlen == 0) {
+			dev_warn(ctrl->device,
+				 "qid %d: empty DH value\n",
+				 chap->qid);
+			chap->status = NVME_AUTH_DHCHAP_FAILURE_DHGROUP_UNUSABLE;
+			return NVME_SC_INVALID_FIELD;
+		}
+
+		chap->dh_tfm = crypto_alloc_kpp(kpp_name, 0, 0);
+		if (IS_ERR(chap->dh_tfm)) {
+			int ret = PTR_ERR(chap->dh_tfm);
+
+			dev_warn(ctrl->device,
+				 "qid %d: error %d initializing DH group %s\n",
+				 chap->qid, ret, gid_name);
+			chap->status = NVME_AUTH_DHCHAP_FAILURE_DHGROUP_UNUSABLE;
+			chap->dh_tfm = NULL;
+			return NVME_SC_AUTH_REQUIRED;
+		}
+		dev_dbg(ctrl->device, "qid %d: selected DH group %s\n",
+			chap->qid, gid_name);
+	} else if (dhvlen != 0) {
+		dev_warn(ctrl->device,
+			 "qid %d: invalid DH value for NULL DH\n",
+			 chap->qid);
+		chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
+		return NVME_SC_INVALID_FIELD;
+	}
+	chap->dhgroup_id = data->dhgid;
+
+skip_kpp:
+	chap->s1 = le32_to_cpu(data->seqnum);
+	memcpy(chap->c1, data->cval, chap->hash_len);
+	if (dhvlen) {
+		chap->ctrl_key = kmalloc(dhvlen, GFP_KERNEL);
+		if (!chap->ctrl_key) {
+			chap->status = NVME_AUTH_DHCHAP_FAILURE_FAILED;
+			return NVME_SC_AUTH_REQUIRED;
+		}
+		chap->ctrl_key_len = dhvlen;
+		memcpy(chap->ctrl_key, data->cval + chap->hash_len,
+		       dhvlen);
+		dev_dbg(ctrl->device, "ctrl public key %*ph\n",
+			 (int)chap->ctrl_key_len, chap->ctrl_key);
+	}
+
+	return 0;
+}
+
+static int nvme_auth_set_dhchap_reply_data(struct nvme_ctrl *ctrl,
+		struct nvme_dhchap_queue_context *chap)
+{
+	struct nvmf_auth_dhchap_reply_data *data = chap->buf;
+	size_t size = sizeof(*data);
+
+	size += 2 * chap->hash_len;
+
+	if (chap->host_key_len)
+		size += chap->host_key_len;
+
+	if (chap->buf_size < size) {
+		chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
+		return -EINVAL;
+	}
+
+	memset(chap->buf, 0, size);
+	data->auth_type = NVME_AUTH_DHCHAP_MESSAGES;
+	data->auth_id = NVME_AUTH_DHCHAP_MESSAGE_REPLY;
+	data->t_id = cpu_to_le16(chap->transaction);
+	data->hl = chap->hash_len;
+	data->dhvlen = cpu_to_le16(chap->host_key_len);
+	memcpy(data->rval, chap->response, chap->hash_len);
+	if (ctrl->ctrl_key) {
+		get_random_bytes(chap->c2, chap->hash_len);
+		data->cvalid = 1;
+		chap->s2 = nvme_auth_get_seqnum();
+		memcpy(data->rval + chap->hash_len, chap->c2,
+		       chap->hash_len);
+		dev_dbg(ctrl->device, "%s: qid %d ctrl challenge %*ph\n",
+			__func__, chap->qid, (int)chap->hash_len, chap->c2);
+	} else {
+		memset(chap->c2, 0, chap->hash_len);
+		chap->s2 = 0;
+	}
+	data->seqnum = cpu_to_le32(chap->s2);
+	if (chap->host_key_len) {
+		dev_dbg(ctrl->device, "%s: qid %d host public key %*ph\n",
+			__func__, chap->qid,
+			chap->host_key_len, chap->host_key);
+		memcpy(data->rval + 2 * chap->hash_len, chap->host_key,
+		       chap->host_key_len);
+	}
+
+	return size;
+}
+
+static int nvme_auth_process_dhchap_success1(struct nvme_ctrl *ctrl,
+		struct nvme_dhchap_queue_context *chap)
+{
+	struct nvmf_auth_dhchap_success1_data *data = chap->buf;
+	size_t size = sizeof(*data);
+
+	if (ctrl->ctrl_key)
+		size += chap->hash_len;
+
+	if (chap->buf_size < size) {
+		chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
+		return NVME_SC_INVALID_FIELD;
+	}
+
+	if (data->hl != chap->hash_len) {
+		dev_warn(ctrl->device,
+			 "qid %d: invalid hash length %u\n",
+			 chap->qid, data->hl);
+		chap->status = NVME_AUTH_DHCHAP_FAILURE_HASH_UNUSABLE;
+		return NVME_SC_INVALID_FIELD;
+	}
+
+	/* Just print out information for the admin queue */
+	if (chap->qid == 0)
+		dev_info(ctrl->device,
+			 "qid 0: authenticated with hash %s dhgroup %s\n",
+			 nvme_auth_hmac_name(chap->hash_id),
+			 nvme_auth_dhgroup_name(chap->dhgroup_id));
+
+	if (!data->rvalid)
+		return 0;
+
+	/* Validate controller response */
+	if (memcmp(chap->response, data->rval, data->hl)) {
+		dev_dbg(ctrl->device, "%s: qid %d ctrl response %*ph\n",
+			__func__, chap->qid, (int)chap->hash_len, data->rval);
+		dev_dbg(ctrl->device, "%s: qid %d host response %*ph\n",
+			__func__, chap->qid, (int)chap->hash_len,
+			chap->response);
+		dev_warn(ctrl->device,
+			 "qid %d: controller authentication failed\n",
+			 chap->qid);
+		chap->status = NVME_AUTH_DHCHAP_FAILURE_FAILED;
+		return NVME_SC_AUTH_REQUIRED;
+	}
+
+	/* Just print out information for the admin queue */
+	if (chap->qid == 0)
+		dev_info(ctrl->device,
+			 "qid 0: controller authenticated\n");
+	return 0;
+}
+
+static int nvme_auth_set_dhchap_success2_data(struct nvme_ctrl *ctrl,
+		struct nvme_dhchap_queue_context *chap)
+{
+	struct nvmf_auth_dhchap_success2_data *data = chap->buf;
+	size_t size = sizeof(*data);
+
+	memset(chap->buf, 0, size);
+	data->auth_type = NVME_AUTH_DHCHAP_MESSAGES;
+	data->auth_id = NVME_AUTH_DHCHAP_MESSAGE_SUCCESS2;
+	data->t_id = cpu_to_le16(chap->transaction);
+
+	return size;
+}
+
+static int nvme_auth_set_dhchap_failure2_data(struct nvme_ctrl *ctrl,
+		struct nvme_dhchap_queue_context *chap)
+{
+	struct nvmf_auth_dhchap_failure_data *data = chap->buf;
+	size_t size = sizeof(*data);
+
+	memset(chap->buf, 0, size);
+	data->auth_type = NVME_AUTH_COMMON_MESSAGES;
+	data->auth_id = NVME_AUTH_DHCHAP_MESSAGE_FAILURE2;
+	data->t_id = cpu_to_le16(chap->transaction);
+	data->rescode = NVME_AUTH_DHCHAP_FAILURE_REASON_FAILED;
+	data->rescode_exp = chap->status;
+
+	return size;
+}
+
+static int nvme_auth_dhchap_setup_host_response(struct nvme_ctrl *ctrl,
+		struct nvme_dhchap_queue_context *chap)
+{
+	SHASH_DESC_ON_STACK(shash, chap->shash_tfm);
+	u8 buf[4], *challenge = chap->c1;
+	int ret;
+
+	dev_dbg(ctrl->device, "%s: qid %d host response seq %u transaction %d\n",
+		__func__, chap->qid, chap->s1, chap->transaction);
+
+	if (!chap->host_response) {
+		chap->host_response = nvme_auth_transform_key(ctrl->host_key,
+						ctrl->opts->host->nqn);
+		if (IS_ERR(chap->host_response)) {
+			ret = PTR_ERR(chap->host_response);
+			chap->host_response = NULL;
+			return ret;
+		}
+	} else {
+		dev_dbg(ctrl->device, "%s: qid %d re-using host response\n",
+			__func__, chap->qid);
+	}
+
+	ret = crypto_shash_setkey(chap->shash_tfm,
+			chap->host_response, ctrl->host_key->len);
+	if (ret) {
+		dev_warn(ctrl->device, "qid %d: failed to set key, error %d\n",
+			 chap->qid, ret);
+		goto out;
+	}
+
+	if (chap->dh_tfm) {
+		challenge = kmalloc(chap->hash_len, GFP_KERNEL);
+		if (!challenge) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		ret = nvme_auth_augmented_challenge(chap->hash_id,
+						    chap->sess_key,
+						    chap->sess_key_len,
+						    chap->c1, challenge,
+						    chap->hash_len);
+		if (ret)
+			goto out;
+	}
+
+	shash->tfm = chap->shash_tfm;
+	ret = crypto_shash_init(shash);
+	if (ret)
+		goto out;
+	ret = crypto_shash_update(shash, challenge, chap->hash_len);
+	if (ret)
+		goto out;
+	put_unaligned_le32(chap->s1, buf);
+	ret = crypto_shash_update(shash, buf, 4);
+	if (ret)
+		goto out;
+	put_unaligned_le16(chap->transaction, buf);
+	ret = crypto_shash_update(shash, buf, 2);
+	if (ret)
+		goto out;
+	memset(buf, 0, sizeof(buf));
+	ret = crypto_shash_update(shash, buf, 1);
+	if (ret)
+		goto out;
+	ret = crypto_shash_update(shash, "HostHost", 8);
+	if (ret)
+		goto out;
+	ret = crypto_shash_update(shash, ctrl->opts->host->nqn,
+				  strlen(ctrl->opts->host->nqn));
+	if (ret)
+		goto out;
+	ret = crypto_shash_update(shash, buf, 1);
+	if (ret)
+		goto out;
+	ret = crypto_shash_update(shash, ctrl->opts->subsysnqn,
+			    strlen(ctrl->opts->subsysnqn));
+	if (ret)
+		goto out;
+	ret = crypto_shash_final(shash, chap->response);
+out:
+	if (challenge != chap->c1)
+		kfree(challenge);
+	return ret;
+}
+
+static int nvme_auth_dhchap_setup_ctrl_response(struct nvme_ctrl *ctrl,
+		struct nvme_dhchap_queue_context *chap)
+{
+	SHASH_DESC_ON_STACK(shash, chap->shash_tfm);
+	u8 *ctrl_response;
+	u8 buf[4], *challenge = chap->c2;
+	int ret;
+
+	ctrl_response = nvme_auth_transform_key(ctrl->ctrl_key,
+				ctrl->opts->subsysnqn);
+	if (IS_ERR(ctrl_response)) {
+		ret = PTR_ERR(ctrl_response);
+		return ret;
+	}
+	ret = crypto_shash_setkey(chap->shash_tfm,
+			ctrl_response, ctrl->ctrl_key->len);
+	if (ret) {
+		dev_warn(ctrl->device, "qid %d: failed to set key, error %d\n",
+			 chap->qid, ret);
+		goto out;
+	}
+
+	if (chap->dh_tfm) {
+		challenge = kmalloc(chap->hash_len, GFP_KERNEL);
+		if (!challenge) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		ret = nvme_auth_augmented_challenge(chap->hash_id,
+						    chap->sess_key,
+						    chap->sess_key_len,
+						    chap->c2, challenge,
+						    chap->hash_len);
+		if (ret)
+			goto out;
+	}
+	dev_dbg(ctrl->device, "%s: qid %d ctrl response seq %u transaction %d\n",
+		__func__, chap->qid, chap->s2, chap->transaction);
+	dev_dbg(ctrl->device, "%s: qid %d challenge %*ph\n",
+		__func__, chap->qid, (int)chap->hash_len, challenge);
+	dev_dbg(ctrl->device, "%s: qid %d subsysnqn %s\n",
+		__func__, chap->qid, ctrl->opts->subsysnqn);
+	dev_dbg(ctrl->device, "%s: qid %d hostnqn %s\n",
+		__func__, chap->qid, ctrl->opts->host->nqn);
+	shash->tfm = chap->shash_tfm;
+	ret = crypto_shash_init(shash);
+	if (ret)
+		goto out;
+	ret = crypto_shash_update(shash, challenge, chap->hash_len);
+	if (ret)
+		goto out;
+	put_unaligned_le32(chap->s2, buf);
+	ret = crypto_shash_update(shash, buf, 4);
+	if (ret)
+		goto out;
+	put_unaligned_le16(chap->transaction, buf);
+	ret = crypto_shash_update(shash, buf, 2);
+	if (ret)
+		goto out;
+	memset(buf, 0, 4);
+	ret = crypto_shash_update(shash, buf, 1);
+	if (ret)
+		goto out;
+	ret = crypto_shash_update(shash, "Controller", 10);
+	if (ret)
+		goto out;
+	ret = crypto_shash_update(shash, ctrl->opts->subsysnqn,
+				  strlen(ctrl->opts->subsysnqn));
+	if (ret)
+		goto out;
+	ret = crypto_shash_update(shash, buf, 1);
+	if (ret)
+		goto out;
+	ret = crypto_shash_update(shash, ctrl->opts->host->nqn,
+				  strlen(ctrl->opts->host->nqn));
+	if (ret)
+		goto out;
+	ret = crypto_shash_final(shash, chap->response);
+out:
+	if (challenge != chap->c2)
+		kfree(challenge);
+	kfree(ctrl_response);
+	return ret;
+}
+
+static int nvme_auth_dhchap_exponential(struct nvme_ctrl *ctrl,
+		struct nvme_dhchap_queue_context *chap)
+{
+	int ret;
+
+	if (chap->host_key && chap->host_key_len) {
+		dev_dbg(ctrl->device,
+			"qid %d: reusing host key\n", chap->qid);
+		goto gen_sesskey;
+	}
+	ret = nvme_auth_gen_privkey(chap->dh_tfm, chap->dhgroup_id);
+	if (ret < 0) {
+		chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
+		return ret;
+	}
+
+	chap->host_key_len = crypto_kpp_maxsize(chap->dh_tfm);
+
+	chap->host_key = kzalloc(chap->host_key_len, GFP_KERNEL);
+	if (!chap->host_key) {
+		chap->host_key_len = 0;
+		chap->status = NVME_AUTH_DHCHAP_FAILURE_FAILED;
+		return -ENOMEM;
+	}
+	ret = nvme_auth_gen_pubkey(chap->dh_tfm,
+				   chap->host_key, chap->host_key_len);
+	if (ret) {
+		dev_dbg(ctrl->device,
+			"failed to generate public key, error %d\n", ret);
+		kfree(chap->host_key);
+		chap->host_key = NULL;
+		chap->host_key_len = 0;
+		chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
+		return ret;
+	}
+
+gen_sesskey:
+	chap->sess_key_len = chap->host_key_len;
+	chap->sess_key = kmalloc(chap->sess_key_len, GFP_KERNEL);
+	if (!chap->sess_key) {
+		chap->sess_key_len = 0;
+		chap->status = NVME_AUTH_DHCHAP_FAILURE_FAILED;
+		return -ENOMEM;
+	}
+
+	ret = nvme_auth_gen_shared_secret(chap->dh_tfm,
+					  chap->ctrl_key, chap->ctrl_key_len,
+					  chap->sess_key, chap->sess_key_len);
+	if (ret) {
+		dev_dbg(ctrl->device,
+			"failed to generate shared secret, error %d\n", ret);
+		kfree_sensitive(chap->sess_key);
+		chap->sess_key = NULL;
+		chap->sess_key_len = 0;
+		chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
+		return ret;
+	}
+	dev_dbg(ctrl->device, "shared secret %*ph\n",
+		(int)chap->sess_key_len, chap->sess_key);
+	return 0;
+}
+
+static void __nvme_auth_reset(struct nvme_dhchap_queue_context *chap)
+{
+	kfree_sensitive(chap->host_response);
+	chap->host_response = NULL;
+	kfree_sensitive(chap->host_key);
+	chap->host_key = NULL;
+	chap->host_key_len = 0;
+	kfree_sensitive(chap->ctrl_key);
+	chap->ctrl_key = NULL;
+	chap->ctrl_key_len = 0;
+	kfree_sensitive(chap->sess_key);
+	chap->sess_key = NULL;
+	chap->sess_key_len = 0;
+	chap->status = 0;
+	chap->error = 0;
+	chap->s1 = 0;
+	chap->s2 = 0;
+	chap->transaction = 0;
+	memset(chap->c1, 0, sizeof(chap->c1));
+	memset(chap->c2, 0, sizeof(chap->c2));
+}
+
+static void __nvme_auth_free(struct nvme_dhchap_queue_context *chap)
+{
+	__nvme_auth_reset(chap);
+	if (chap->shash_tfm)
+		crypto_free_shash(chap->shash_tfm);
+	if (chap->dh_tfm)
+		crypto_free_kpp(chap->dh_tfm);
+	kfree_sensitive(chap->ctrl_key);
+	kfree_sensitive(chap->host_key);
+	kfree_sensitive(chap->sess_key);
+	kfree_sensitive(chap->host_response);
+	kfree(chap->buf);
+	kfree(chap);
+}
+
+static void __nvme_auth_work(struct work_struct *work)
+{
+	struct nvme_dhchap_queue_context *chap =
+		container_of(work, struct nvme_dhchap_queue_context, auth_work);
+	struct nvme_ctrl *ctrl = chap->ctrl;
+	size_t tl;
+	int ret = 0;
+
+	chap->transaction = ctrl->transaction++;
+
+	/* DH-HMAC-CHAP Step 1: send negotiate */
+	dev_dbg(ctrl->device, "%s: qid %d send negotiate\n",
+		__func__, chap->qid);
+	ret = nvme_auth_set_dhchap_negotiate_data(ctrl, chap);
+	if (ret < 0) {
+		chap->error = ret;
+		return;
+	}
+	tl = ret;
+	ret = nvme_auth_submit(ctrl, chap->qid, chap->buf, tl, true);
+	if (ret) {
+		chap->error = ret;
+		return;
+	}
+
+	/* DH-HMAC-CHAP Step 2: receive challenge */
+	dev_dbg(ctrl->device, "%s: qid %d receive challenge\n",
+		__func__, chap->qid);
+
+	memset(chap->buf, 0, chap->buf_size);
+	ret = nvme_auth_submit(ctrl, chap->qid, chap->buf, chap->buf_size, false);
+	if (ret) {
+		dev_warn(ctrl->device,
+			 "qid %d failed to receive challenge, %s %d\n",
+			 chap->qid, ret < 0 ? "error" : "nvme status", ret);
+		chap->error = ret;
+		return;
+	}
+	ret = nvme_auth_receive_validate(ctrl, chap->qid, chap->buf, chap->transaction,
+					 NVME_AUTH_DHCHAP_MESSAGE_CHALLENGE);
+	if (ret) {
+		chap->status = ret;
+		chap->error = NVME_SC_AUTH_REQUIRED;
+		return;
+	}
+
+	ret = nvme_auth_process_dhchap_challenge(ctrl, chap);
+	if (ret) {
+		/* Invalid challenge parameters */
+		chap->error = ret;
+		goto fail2;
+	}
+
+	if (chap->ctrl_key_len) {
+		dev_dbg(ctrl->device,
+			"%s: qid %d DH exponential\n",
+			__func__, chap->qid);
+		ret = nvme_auth_dhchap_exponential(ctrl, chap);
+		if (ret) {
+			chap->error = ret;
+			goto fail2;
+		}
+	}
+
+	dev_dbg(ctrl->device, "%s: qid %d host response\n",
+		__func__, chap->qid);
+	ret = nvme_auth_dhchap_setup_host_response(ctrl, chap);
+	if (ret) {
+		chap->error = ret;
+		goto fail2;
+	}
+
+	/* DH-HMAC-CHAP Step 3: send reply */
+	dev_dbg(ctrl->device, "%s: qid %d send reply\n",
+		__func__, chap->qid);
+	ret = nvme_auth_set_dhchap_reply_data(ctrl, chap);
+	if (ret < 0) {
+		chap->error = ret;
+		goto fail2;
+	}
+
+	tl = ret;
+	ret = nvme_auth_submit(ctrl, chap->qid, chap->buf, tl, true);
+	if (ret) {
+		chap->error = ret;
+		goto fail2;
+	}
+
+	/* DH-HMAC-CHAP Step 4: receive success1 */
+	dev_dbg(ctrl->device, "%s: qid %d receive success1\n",
+		__func__, chap->qid);
+
+	memset(chap->buf, 0, chap->buf_size);
+	ret = nvme_auth_submit(ctrl, chap->qid, chap->buf, chap->buf_size, false);
+	if (ret) {
+		dev_warn(ctrl->device,
+			 "qid %d failed to receive success1, %s %d\n",
+			 chap->qid, ret < 0 ? "error" : "nvme status", ret);
+		chap->error = ret;
+		return;
+	}
+	ret = nvme_auth_receive_validate(ctrl, chap->qid,
+					 chap->buf, chap->transaction,
+					 NVME_AUTH_DHCHAP_MESSAGE_SUCCESS1);
+	if (ret) {
+		chap->status = ret;
+		chap->error = NVME_SC_AUTH_REQUIRED;
+		return;
+	}
+
+	if (ctrl->ctrl_key) {
+		dev_dbg(ctrl->device,
+			"%s: qid %d controller response\n",
+			__func__, chap->qid);
+		ret = nvme_auth_dhchap_setup_ctrl_response(ctrl, chap);
+		if (ret) {
+			chap->error = ret;
+			goto fail2;
+		}
+	}
+
+	ret = nvme_auth_process_dhchap_success1(ctrl, chap);
+	if (ret) {
+		/* Controller authentication failed */
+		chap->error = NVME_SC_AUTH_REQUIRED;
+		goto fail2;
+	}
+
+	if (ctrl->ctrl_key) {
+		/* DH-HMAC-CHAP Step 5: send success2 */
+		dev_dbg(ctrl->device, "%s: qid %d send success2\n",
+			__func__, chap->qid);
+		tl = nvme_auth_set_dhchap_success2_data(ctrl, chap);
+		ret = nvme_auth_submit(ctrl, chap->qid, chap->buf, tl, true);
+		if (ret)
+			chap->error = ret;
+	}
+	if (!ret) {
+		chap->error = 0;
+		return;
+	}
+
+fail2:
+	dev_dbg(ctrl->device, "%s: qid %d send failure2, status %x\n",
+		__func__, chap->qid, chap->status);
+	tl = nvme_auth_set_dhchap_failure2_data(ctrl, chap);
+	ret = nvme_auth_submit(ctrl, chap->qid, chap->buf, tl, true);
+	/*
+	 * only update error if send failure2 failed and no other
+	 * error had been set during authentication.
+	 */
+	if (ret && !chap->error)
+		chap->error = ret;
+}
+
+int nvme_auth_negotiate(struct nvme_ctrl *ctrl, int qid)
+{
+	struct nvme_dhchap_queue_context *chap;
+
+	if (!ctrl->host_key) {
+		dev_warn(ctrl->device, "qid %d: no key\n", qid);
+		return -ENOKEY;
+	}
+
+	if (ctrl->opts->dhchap_ctrl_secret && !ctrl->ctrl_key) {
+		dev_warn(ctrl->device, "qid %d: invalid ctrl key\n", qid);
+		return -ENOKEY;
+	}
+
+	mutex_lock(&ctrl->dhchap_auth_mutex);
+	/* Check if the context is already queued */
+	list_for_each_entry(chap, &ctrl->dhchap_auth_list, entry) {
+		WARN_ON(!chap->buf);
+		if (chap->qid == qid) {
+			dev_dbg(ctrl->device, "qid %d: re-using context\n", qid);
+			mutex_unlock(&ctrl->dhchap_auth_mutex);
+			flush_work(&chap->auth_work);
+			__nvme_auth_reset(chap);
+			queue_work(nvme_wq, &chap->auth_work);
+			return 0;
+		}
+	}
+	chap = kzalloc(sizeof(*chap), GFP_KERNEL);
+	if (!chap) {
+		mutex_unlock(&ctrl->dhchap_auth_mutex);
+		return -ENOMEM;
+	}
+	chap->qid = (qid == NVME_QID_ANY) ? 0 : qid;
+	chap->ctrl = ctrl;
+
+	/*
+	 * Allocate a large enough buffer for the entire negotiation:
+	 * 4k should be enough to ffdhe8192.
+	 */
+	chap->buf_size = 4096;
+	chap->buf = kzalloc(chap->buf_size, GFP_KERNEL);
+	if (!chap->buf) {
+		mutex_unlock(&ctrl->dhchap_auth_mutex);
+		kfree(chap);
+		return -ENOMEM;
+	}
+
+	INIT_WORK(&chap->auth_work, __nvme_auth_work);
+	list_add(&chap->entry, &ctrl->dhchap_auth_list);
+	mutex_unlock(&ctrl->dhchap_auth_mutex);
+	queue_work(nvme_wq, &chap->auth_work);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_negotiate);
+
+int nvme_auth_wait(struct nvme_ctrl *ctrl, int qid)
+{
+	struct nvme_dhchap_queue_context *chap;
+	int ret;
+
+	mutex_lock(&ctrl->dhchap_auth_mutex);
+	list_for_each_entry(chap, &ctrl->dhchap_auth_list, entry) {
+		if (chap->qid != qid)
+			continue;
+		mutex_unlock(&ctrl->dhchap_auth_mutex);
+		flush_work(&chap->auth_work);
+		ret = chap->error;
+		return ret;
+	}
+	mutex_unlock(&ctrl->dhchap_auth_mutex);
+	return -ENXIO;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_wait);
+
+void nvme_auth_reset(struct nvme_ctrl *ctrl)
+{
+	struct nvme_dhchap_queue_context *chap;
+
+	mutex_lock(&ctrl->dhchap_auth_mutex);
+	list_for_each_entry(chap, &ctrl->dhchap_auth_list, entry) {
+		mutex_unlock(&ctrl->dhchap_auth_mutex);
+		flush_work(&chap->auth_work);
+		__nvme_auth_reset(chap);
+	}
+	mutex_unlock(&ctrl->dhchap_auth_mutex);
+}
+EXPORT_SYMBOL_GPL(nvme_auth_reset);
+
+static void nvme_dhchap_auth_work(struct work_struct *work)
+{
+	struct nvme_ctrl *ctrl =
+		container_of(work, struct nvme_ctrl, dhchap_auth_work);
+	int ret, q;
+
+	/* Authenticate admin queue first */
+	ret = nvme_auth_negotiate(ctrl, 0);
+	if (ret) {
+		dev_warn(ctrl->device,
+			 "qid 0: error %d setting up authentication\n", ret);
+		return;
+	}
+	ret = nvme_auth_wait(ctrl, 0);
+	if (ret) {
+		dev_warn(ctrl->device,
+			 "qid 0: authentication failed\n");
+		return;
+	}
+
+	for (q = 1; q < ctrl->queue_count; q++) {
+		ret = nvme_auth_negotiate(ctrl, q);
+		if (ret) {
+			dev_warn(ctrl->device,
+				 "qid %d: error %d setting up authentication\n",
+				 q, ret);
+			break;
+		}
+	}
+
+	/*
+	 * Failure is a soft-state; credentials remain valid until
+	 * the controller terminates the connection.
+	 */
+}
+
+void nvme_auth_init_ctrl(struct nvme_ctrl *ctrl)
+{
+	INIT_LIST_HEAD(&ctrl->dhchap_auth_list);
+	INIT_WORK(&ctrl->dhchap_auth_work, nvme_dhchap_auth_work);
+	mutex_init(&ctrl->dhchap_auth_mutex);
+	if (!ctrl->opts)
+		return;
+	nvme_auth_generate_key(ctrl->opts->dhchap_secret, &ctrl->host_key);
+	nvme_auth_generate_key(ctrl->opts->dhchap_ctrl_secret, &ctrl->ctrl_key);
+}
+EXPORT_SYMBOL_GPL(nvme_auth_init_ctrl);
+
+void nvme_auth_stop(struct nvme_ctrl *ctrl)
+{
+	struct nvme_dhchap_queue_context *chap = NULL, *tmp;
+
+	cancel_work_sync(&ctrl->dhchap_auth_work);
+	mutex_lock(&ctrl->dhchap_auth_mutex);
+	list_for_each_entry_safe(chap, tmp, &ctrl->dhchap_auth_list, entry)
+		cancel_work_sync(&chap->auth_work);
+	mutex_unlock(&ctrl->dhchap_auth_mutex);
+}
+EXPORT_SYMBOL_GPL(nvme_auth_stop);
+
+void nvme_auth_free(struct nvme_ctrl *ctrl)
+{
+	struct nvme_dhchap_queue_context *chap = NULL, *tmp;
+
+	mutex_lock(&ctrl->dhchap_auth_mutex);
+	list_for_each_entry_safe(chap, tmp, &ctrl->dhchap_auth_list, entry) {
+		list_del_init(&chap->entry);
+		flush_work(&chap->auth_work);
+		__nvme_auth_free(chap);
+	}
+	mutex_unlock(&ctrl->dhchap_auth_mutex);
+	if (ctrl->host_key) {
+		nvme_auth_free_key(ctrl->host_key);
+		ctrl->host_key = NULL;
+	}
+	if (ctrl->ctrl_key) {
+		nvme_auth_free_key(ctrl->ctrl_key);
+		ctrl->ctrl_key = NULL;
+	}
+}
+EXPORT_SYMBOL_GPL(nvme_auth_free);
diff --git a/drivers/nvme/host/constants.c b/drivers/nvme/host/constants.c
index 4910543f00ff..e958d5015585 100644
--- a/drivers/nvme/host/constants.c
+++ b/drivers/nvme/host/constants.c
@@ -6,7 +6,6 @@
 
 #include "nvme.h"
 
-#ifdef CONFIG_NVME_VERBOSE_ERRORS
 static const char * const nvme_ops[] = {
 	[nvme_cmd_flush] = "Flush",
 	[nvme_cmd_write] = "Write",
@@ -178,6 +177,7 @@ const unsigned char *nvme_get_opcode_str(u8 opcode)
 		return nvme_ops[opcode];
 	return "Unknown";
 }
+EXPORT_SYMBOL_GPL(nvme_get_opcode_str);
 
 const unsigned char *nvme_get_admin_opcode_str(u8 opcode)
 {
@@ -185,4 +185,3 @@ const unsigned char *nvme_get_admin_opcode_str(u8 opcode)
 		return nvme_admin_ops[opcode];
 	return "Unknown";
 }
-#endif /* CONFIG_NVME_VERBOSE_ERRORS */
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 2533b88e66d5..2429b11eb9a8 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -24,12 +24,22 @@
 
 #include "nvme.h"
 #include "fabrics.h"
+#include <linux/nvme-auth.h>
 
 #define CREATE_TRACE_POINTS
 #include "trace.h"
 
 #define NVME_MINORS		(1U << MINORBITS)
 
+struct nvme_ns_info {
+	struct nvme_ns_ids ids;
+	u32 nsid;
+	__le32 anagrpid;
+	bool is_shared;
+	bool is_readonly;
+	bool is_ready;
+};
+
 unsigned int admin_timeout = 60;
 module_param(admin_timeout, uint, 0644);
 MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands");
@@ -330,6 +340,7 @@ enum nvme_disposition {
 	COMPLETE,
 	RETRY,
 	FAILOVER,
+	AUTHENTICATE,
 };
 
 static inline enum nvme_disposition nvme_decide_disposition(struct request *req)
@@ -337,6 +348,9 @@ static inline enum nvme_disposition nvme_decide_disposition(struct request *req)
 	if (likely(nvme_req(req)->status == 0))
 		return COMPLETE;
 
+	if ((nvme_req(req)->status & 0x7ff) == NVME_SC_AUTH_REQUIRED)
+		return AUTHENTICATE;
+
 	if (blk_noretry_request(req) ||
 	    (nvme_req(req)->status & NVME_SC_DNR) ||
 	    nvme_req(req)->retries >= nvme_max_retries)
@@ -375,11 +389,13 @@ static inline void nvme_end_req(struct request *req)
 
 void nvme_complete_rq(struct request *req)
 {
+	struct nvme_ctrl *ctrl = nvme_req(req)->ctrl;
+
 	trace_nvme_complete_rq(req);
 	nvme_cleanup_cmd(req);
 
-	if (nvme_req(req)->ctrl->kas)
-		nvme_req(req)->ctrl->comp_seen = true;
+	if (ctrl->kas)
+		ctrl->comp_seen = true;
 
 	switch (nvme_decide_disposition(req)) {
 	case COMPLETE:
@@ -391,6 +407,14 @@ void nvme_complete_rq(struct request *req)
 	case FAILOVER:
 		nvme_failover_req(req);
 		return;
+	case AUTHENTICATE:
+#ifdef CONFIG_NVME_AUTH
+		queue_work(nvme_wq, &ctrl->dhchap_auth_work);
+		nvme_retry_req(req);
+#else
+		nvme_end_req(req);
+#endif
+		return;
 	}
 }
 EXPORT_SYMBOL_GPL(nvme_complete_rq);
@@ -702,7 +726,9 @@ bool __nvme_check_ready(struct nvme_ctrl *ctrl, struct request *rq,
 		switch (ctrl->state) {
 		case NVME_CTRL_CONNECTING:
 			if (blk_rq_is_passthrough(rq) && nvme_is_fabrics(req->cmd) &&
-			    req->cmd->fabrics.fctype == nvme_fabrics_type_connect)
+			    (req->cmd->fabrics.fctype == nvme_fabrics_type_connect ||
+			     req->cmd->fabrics.fctype == nvme_fabrics_type_auth_send ||
+			     req->cmd->fabrics.fctype == nvme_fabrics_type_auth_receive))
 				return true;
 			break;
 		default:
@@ -990,8 +1016,7 @@ static int nvme_execute_rq(struct request *rq, bool at_head)
  */
 int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
 		union nvme_result *result, void *buffer, unsigned bufflen,
-		unsigned timeout, int qid, int at_head,
-		blk_mq_req_flags_t flags)
+		int qid, int at_head, blk_mq_req_flags_t flags)
 {
 	struct request *req;
 	int ret;
@@ -1000,15 +1025,12 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
 		req = blk_mq_alloc_request(q, nvme_req_op(cmd), flags);
 	else
 		req = blk_mq_alloc_request_hctx(q, nvme_req_op(cmd), flags,
-						qid ? qid - 1 : 0);
+						qid - 1);
 
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 	nvme_init_request(req, cmd);
 
-	if (timeout)
-		req->timeout = timeout;
-
 	if (buffer && bufflen) {
 		ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL);
 		if (ret)
@@ -1028,7 +1050,7 @@ EXPORT_SYMBOL_GPL(__nvme_submit_sync_cmd);
 int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
 		void *buffer, unsigned bufflen)
 {
-	return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen, 0,
+	return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen,
 			NVME_QID_ANY, 0, 0);
 }
 EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd);
@@ -1329,8 +1351,8 @@ static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids,
 	}
 }
 
-static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
-		struct nvme_ns_ids *ids)
+static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl,
+		struct nvme_ns_info *info)
 {
 	struct nvme_command c = { };
 	bool csi_seen = false;
@@ -1343,7 +1365,7 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
 		return 0;
 
 	c.identify.opcode = nvme_admin_identify;
-	c.identify.nsid = cpu_to_le32(nsid);
+	c.identify.nsid = cpu_to_le32(info->nsid);
 	c.identify.cns = NVME_ID_CNS_NS_DESC_LIST;
 
 	data = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
@@ -1355,7 +1377,7 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
 	if (status) {
 		dev_warn(ctrl->device,
 			"Identify Descriptors failed (nsid=%u, status=0x%x)\n",
-			nsid, status);
+			info->nsid, status);
 		goto free_data;
 	}
 
@@ -1365,7 +1387,7 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
 		if (cur->nidl == 0)
 			break;
 
-		len = nvme_process_ns_desc(ctrl, ids, cur, &csi_seen);
+		len = nvme_process_ns_desc(ctrl, &info->ids, cur, &csi_seen);
 		if (len < 0)
 			break;
 
@@ -1374,7 +1396,7 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
 
 	if (nvme_multi_css(ctrl) && !csi_seen) {
 		dev_warn(ctrl->device, "Command set not reported for nsid:%d\n",
-			 nsid);
+			 info->nsid);
 		status = -EINVAL;
 	}
 
@@ -1384,7 +1406,7 @@ free_data:
 }
 
 static int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid,
-			struct nvme_ns_ids *ids, struct nvme_id_ns **id)
+			struct nvme_id_ns **id)
 {
 	struct nvme_command c = { };
 	int error;
@@ -1407,20 +1429,6 @@ static int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid,
 	error = NVME_SC_INVALID_NS | NVME_SC_DNR;
 	if ((*id)->ncap == 0) /* namespace not allocated or attached */
 		goto out_free_id;
-
-
-	if (ctrl->quirks & NVME_QUIRK_BOGUS_NID) {
-		dev_info(ctrl->device,
-			 "Ignoring bogus Namespace Identifiers\n");
-	} else {
-		if (ctrl->vs >= NVME_VS(1, 1, 0) &&
-		    !memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
-			memcpy(ids->eui64, (*id)->eui64, sizeof(ids->eui64));
-		if (ctrl->vs >= NVME_VS(1, 2, 0) &&
-		    !memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
-			memcpy(ids->nguid, (*id)->nguid, sizeof(ids->nguid));
-	}
-
 	return 0;
 
 out_free_id:
@@ -1428,30 +1436,59 @@ out_free_id:
 	return error;
 }
 
-static int nvme_identify_ns_cs_indep(struct nvme_ctrl *ctrl, unsigned nsid,
-			struct nvme_id_ns_cs_indep **id)
+static int nvme_ns_info_from_identify(struct nvme_ctrl *ctrl,
+		struct nvme_ns_info *info)
 {
+	struct nvme_ns_ids *ids = &info->ids;
+	struct nvme_id_ns *id;
+	int ret;
+
+	ret = nvme_identify_ns(ctrl, info->nsid, &id);
+	if (ret)
+		return ret;
+	info->anagrpid = id->anagrpid;
+	info->is_shared = id->nmic & NVME_NS_NMIC_SHARED;
+	info->is_readonly = id->nsattr & NVME_NS_ATTR_RO;
+	info->is_ready = true;
+	if (ctrl->quirks & NVME_QUIRK_BOGUS_NID) {
+		dev_info(ctrl->device,
+			 "Ignoring bogus Namespace Identifiers\n");
+	} else {
+		if (ctrl->vs >= NVME_VS(1, 1, 0) &&
+		    !memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
+			memcpy(ids->eui64, id->eui64, sizeof(ids->eui64));
+		if (ctrl->vs >= NVME_VS(1, 2, 0) &&
+		    !memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
+			memcpy(ids->nguid, id->nguid, sizeof(ids->nguid));
+	}
+	kfree(id);
+	return 0;
+}
+
+static int nvme_ns_info_from_id_cs_indep(struct nvme_ctrl *ctrl,
+		struct nvme_ns_info *info)
+{
+	struct nvme_id_ns_cs_indep *id;
 	struct nvme_command c = {
 		.identify.opcode	= nvme_admin_identify,
-		.identify.nsid		= cpu_to_le32(nsid),
+		.identify.nsid		= cpu_to_le32(info->nsid),
 		.identify.cns		= NVME_ID_CNS_NS_CS_INDEP,
 	};
 	int ret;
 
-	*id = kmalloc(sizeof(**id), GFP_KERNEL);
-	if (!*id)
+	id = kmalloc(sizeof(*id), GFP_KERNEL);
+	if (!id)
 		return -ENOMEM;
 
-	ret = nvme_submit_sync_cmd(ctrl->admin_q, &c, *id, sizeof(**id));
-	if (ret) {
-		dev_warn(ctrl->device,
-			 "Identify namespace (CS independent) failed (%d)\n",
-			 ret);
-		kfree(*id);
-		return ret;
+	ret = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id));
+	if (!ret) {
+		info->anagrpid = id->anagrpid;
+		info->is_shared = id->nmic & NVME_NS_NMIC_SHARED;
+		info->is_readonly = id->nsattr & NVME_NS_ATTR_RO;
+		info->is_ready = id->nstat & NVME_NSTAT_NRDY;
 	}
-
-	return 0;
+	kfree(id);
+	return ret;
 }
 
 static int nvme_features(struct nvme_ctrl *dev, u8 op, unsigned int fid,
@@ -1466,7 +1503,7 @@ static int nvme_features(struct nvme_ctrl *dev, u8 op, unsigned int fid,
 	c.features.dword11 = cpu_to_le32(dword11);
 
 	ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &res,
-			buffer, buflen, 0, NVME_QID_ANY, 0, 0);
+			buffer, buflen, NVME_QID_ANY, 0, 0);
 	if (ret >= 0 && result)
 		*result = le32_to_cpu(res.u32);
 	return ret;
@@ -1875,6 +1912,11 @@ static void nvme_update_disk_info(struct gendisk *disk,
 					   ns->ctrl->max_zeroes_sectors);
 }
 
+static bool nvme_ns_is_readonly(struct nvme_ns *ns, struct nvme_ns_info *info)
+{
+	return info->is_readonly || test_bit(NVME_NS_FORCE_RO, &ns->flags);
+}
+
 static inline bool nvme_first_scan(struct gendisk *disk)
 {
 	/* nvme_alloc_ns() scans the disk prior to adding it */
@@ -1912,12 +1954,44 @@ static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id)
 	blk_queue_chunk_sectors(ns->queue, iob);
 }
 
-static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id)
+static int nvme_update_ns_info_generic(struct nvme_ns *ns,
+		struct nvme_ns_info *info)
 {
-	unsigned lbaf = nvme_lbaf_index(id->flbas);
+	blk_mq_freeze_queue(ns->disk->queue);
+	nvme_set_queue_limits(ns->ctrl, ns->queue);
+	set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info));
+	blk_mq_unfreeze_queue(ns->disk->queue);
+
+	if (nvme_ns_head_multipath(ns->head)) {
+		blk_mq_freeze_queue(ns->head->disk->queue);
+		set_disk_ro(ns->head->disk, nvme_ns_is_readonly(ns, info));
+		nvme_mpath_revalidate_paths(ns);
+		blk_stack_limits(&ns->head->disk->queue->limits,
+				 &ns->queue->limits, 0);
+		ns->head->disk->flags |= GENHD_FL_HIDDEN;
+		blk_mq_unfreeze_queue(ns->head->disk->queue);
+	}
+
+	/* Hide the block-interface for these devices */
+	ns->disk->flags |= GENHD_FL_HIDDEN;
+	set_bit(NVME_NS_READY, &ns->flags);
+
+	return 0;
+}
+
+static int nvme_update_ns_info_block(struct nvme_ns *ns,
+		struct nvme_ns_info *info)
+{
+	struct nvme_id_ns *id;
+	unsigned lbaf;
 	int ret;
 
+	ret = nvme_identify_ns(ns->ctrl, info->nsid, &id);
+	if (ret)
+		return ret;
+
 	blk_mq_freeze_queue(ns->disk->queue);
+	lbaf = nvme_lbaf_index(id->flbas);
 	ns->lba_shift = id->lbaf[lbaf].ds;
 	nvme_set_queue_limits(ns->ctrl, ns->queue);
 
@@ -1927,36 +2001,35 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id)
 
 	if (ns->head->ids.csi == NVME_CSI_ZNS) {
 		ret = nvme_update_zone_info(ns, lbaf);
-		if (ret)
-			goto out_unfreeze;
+		if (ret) {
+			blk_mq_unfreeze_queue(ns->disk->queue);
+			goto out;
+		}
 	}
 
-	set_disk_ro(ns->disk, (id->nsattr & NVME_NS_ATTR_RO) ||
-		test_bit(NVME_NS_FORCE_RO, &ns->flags));
+	set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info));
 	set_bit(NVME_NS_READY, &ns->flags);
 	blk_mq_unfreeze_queue(ns->disk->queue);
 
 	if (blk_queue_is_zoned(ns->queue)) {
 		ret = nvme_revalidate_zones(ns);
 		if (ret && !nvme_first_scan(ns->disk))
-			return ret;
+			goto out;
 	}
 
 	if (nvme_ns_head_multipath(ns->head)) {
 		blk_mq_freeze_queue(ns->head->disk->queue);
 		nvme_update_disk_info(ns->head->disk, ns, id);
-		set_disk_ro(ns->head->disk,
-			    (id->nsattr & NVME_NS_ATTR_RO) ||
-				    test_bit(NVME_NS_FORCE_RO, &ns->flags));
+		set_disk_ro(ns->head->disk, nvme_ns_is_readonly(ns, info));
 		nvme_mpath_revalidate_paths(ns);
 		blk_stack_limits(&ns->head->disk->queue->limits,
 				 &ns->queue->limits, 0);
 		disk_update_readahead(ns->head->disk);
 		blk_mq_unfreeze_queue(ns->head->disk->queue);
 	}
-	return 0;
 
-out_unfreeze:
+	ret = 0;
+out:
 	/*
 	 * If probing fails due an unsupported feature, hide the block device,
 	 * but still allow other access.
@@ -1966,10 +2039,31 @@ out_unfreeze:
 		set_bit(NVME_NS_READY, &ns->flags);
 		ret = 0;
 	}
-	blk_mq_unfreeze_queue(ns->disk->queue);
+	kfree(id);
 	return ret;
 }
 
+static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_ns_info *info)
+{
+	switch (info->ids.csi) {
+	case NVME_CSI_ZNS:
+		if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED)) {
+			dev_info(ns->ctrl->device,
+	"block device for nsid %u not supported without CONFIG_BLK_DEV_ZONED\n",
+				info->nsid);
+			return nvme_update_ns_info_generic(ns, info);
+		}
+		return nvme_update_ns_info_block(ns, info);
+	case NVME_CSI_NVM:
+		return nvme_update_ns_info_block(ns, info);
+	default:
+		dev_info(ns->ctrl->device,
+			"block device for nsid %u not supported (csi %u)\n",
+			info->nsid, info->ids.csi);
+		return nvme_update_ns_info_generic(ns, info);
+	}
+}
+
 static char nvme_pr_type(enum pr_type type)
 {
 	switch (type) {
@@ -2103,7 +2197,7 @@ int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
 	cmd.common.cdw10 = cpu_to_le32(((u32)secp) << 24 | ((u32)spsp) << 8);
 	cmd.common.cdw11 = cpu_to_le32(len);
 
-	return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len, 0,
+	return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len,
 			NVME_QID_ANY, 1, 0);
 }
 EXPORT_SYMBOL_GPL(nvme_sec_submit);
@@ -2123,6 +2217,7 @@ static int nvme_report_zones(struct gendisk *disk, sector_t sector,
 static const struct block_device_operations nvme_bdev_ops = {
 	.owner		= THIS_MODULE,
 	.ioctl		= nvme_ioctl,
+	.compat_ioctl	= blkdev_compat_ptr_ioctl,
 	.open		= nvme_open,
 	.release	= nvme_release,
 	.getgeo		= nvme_getgeo,
@@ -3613,6 +3708,108 @@ static ssize_t dctype_show(struct device *dev,
 }
 static DEVICE_ATTR_RO(dctype);
 
+#ifdef CONFIG_NVME_AUTH
+static ssize_t nvme_ctrl_dhchap_secret_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+	struct nvmf_ctrl_options *opts = ctrl->opts;
+
+	if (!opts->dhchap_secret)
+		return sysfs_emit(buf, "none\n");
+	return sysfs_emit(buf, "%s\n", opts->dhchap_secret);
+}
+
+static ssize_t nvme_ctrl_dhchap_secret_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+	struct nvmf_ctrl_options *opts = ctrl->opts;
+	char *dhchap_secret;
+
+	if (!ctrl->opts->dhchap_secret)
+		return -EINVAL;
+	if (count < 7)
+		return -EINVAL;
+	if (memcmp(buf, "DHHC-1:", 7))
+		return -EINVAL;
+
+	dhchap_secret = kzalloc(count + 1, GFP_KERNEL);
+	if (!dhchap_secret)
+		return -ENOMEM;
+	memcpy(dhchap_secret, buf, count);
+	nvme_auth_stop(ctrl);
+	if (strcmp(dhchap_secret, opts->dhchap_secret)) {
+		int ret;
+
+		ret = nvme_auth_generate_key(dhchap_secret, &ctrl->host_key);
+		if (ret)
+			return ret;
+		kfree(opts->dhchap_secret);
+		opts->dhchap_secret = dhchap_secret;
+		/* Key has changed; re-authentication with new key */
+		nvme_auth_reset(ctrl);
+	}
+	/* Start re-authentication */
+	dev_info(ctrl->device, "re-authenticating controller\n");
+	queue_work(nvme_wq, &ctrl->dhchap_auth_work);
+
+	return count;
+}
+static DEVICE_ATTR(dhchap_secret, S_IRUGO | S_IWUSR,
+	nvme_ctrl_dhchap_secret_show, nvme_ctrl_dhchap_secret_store);
+
+static ssize_t nvme_ctrl_dhchap_ctrl_secret_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+	struct nvmf_ctrl_options *opts = ctrl->opts;
+
+	if (!opts->dhchap_ctrl_secret)
+		return sysfs_emit(buf, "none\n");
+	return sysfs_emit(buf, "%s\n", opts->dhchap_ctrl_secret);
+}
+
+static ssize_t nvme_ctrl_dhchap_ctrl_secret_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+	struct nvmf_ctrl_options *opts = ctrl->opts;
+	char *dhchap_secret;
+
+	if (!ctrl->opts->dhchap_ctrl_secret)
+		return -EINVAL;
+	if (count < 7)
+		return -EINVAL;
+	if (memcmp(buf, "DHHC-1:", 7))
+		return -EINVAL;
+
+	dhchap_secret = kzalloc(count + 1, GFP_KERNEL);
+	if (!dhchap_secret)
+		return -ENOMEM;
+	memcpy(dhchap_secret, buf, count);
+	nvme_auth_stop(ctrl);
+	if (strcmp(dhchap_secret, opts->dhchap_ctrl_secret)) {
+		int ret;
+
+		ret = nvme_auth_generate_key(dhchap_secret, &ctrl->ctrl_key);
+		if (ret)
+			return ret;
+		kfree(opts->dhchap_ctrl_secret);
+		opts->dhchap_ctrl_secret = dhchap_secret;
+		/* Key has changed; re-authentication with new key */
+		nvme_auth_reset(ctrl);
+	}
+	/* Start re-authentication */
+	dev_info(ctrl->device, "re-authenticating controller\n");
+	queue_work(nvme_wq, &ctrl->dhchap_auth_work);
+
+	return count;
+}
+static DEVICE_ATTR(dhchap_ctrl_secret, S_IRUGO | S_IWUSR,
+	nvme_ctrl_dhchap_ctrl_secret_show, nvme_ctrl_dhchap_ctrl_secret_store);
+#endif
+
 static struct attribute *nvme_dev_attrs[] = {
 	&dev_attr_reset_controller.attr,
 	&dev_attr_rescan_controller.attr,
@@ -3636,6 +3833,10 @@ static struct attribute *nvme_dev_attrs[] = {
 	&dev_attr_kato.attr,
 	&dev_attr_cntrltype.attr,
 	&dev_attr_dctype.attr,
+#ifdef CONFIG_NVME_AUTH
+	&dev_attr_dhchap_secret.attr,
+	&dev_attr_dhchap_ctrl_secret.attr,
+#endif
 	NULL
 };
 
@@ -3659,6 +3860,12 @@ static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj,
 		return 0;
 	if (a == &dev_attr_fast_io_fail_tmo.attr && !ctrl->opts)
 		return 0;
+#ifdef CONFIG_NVME_AUTH
+	if (a == &dev_attr_dhchap_secret.attr && !ctrl->opts)
+		return 0;
+	if (a == &dev_attr_dhchap_ctrl_secret.attr && !ctrl->opts)
+		return 0;
+#endif
 
 	return a->mode;
 }
@@ -3786,7 +3993,7 @@ static int nvme_add_ns_cdev(struct nvme_ns *ns)
 }
 
 static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
-		unsigned nsid, struct nvme_ns_ids *ids, bool is_shared)
+		struct nvme_ns_info *info)
 {
 	struct nvme_ns_head *head;
 	size_t size = sizeof(*head);
@@ -3808,9 +4015,9 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
 	if (ret)
 		goto out_ida_remove;
 	head->subsys = ctrl->subsys;
-	head->ns_id = nsid;
-	head->ids = *ids;
-	head->shared = is_shared;
+	head->ns_id = info->nsid;
+	head->ids = info->ids;
+	head->shared = info->is_shared;
 	kref_init(&head->ref);
 
 	if (head->ids.csi) {
@@ -3867,54 +4074,54 @@ static int nvme_global_check_duplicate_ids(struct nvme_subsystem *this,
 	return ret;
 }
 
-static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid,
-		struct nvme_ns_ids *ids, bool is_shared)
+static int nvme_init_ns_head(struct nvme_ns *ns, struct nvme_ns_info *info)
 {
 	struct nvme_ctrl *ctrl = ns->ctrl;
 	struct nvme_ns_head *head = NULL;
 	int ret;
 
-	ret = nvme_global_check_duplicate_ids(ctrl->subsys, ids);
+	ret = nvme_global_check_duplicate_ids(ctrl->subsys, &info->ids);
 	if (ret) {
 		dev_err(ctrl->device,
-			"globally duplicate IDs for nsid %d\n", nsid);
+			"globally duplicate IDs for nsid %d\n", info->nsid);
 		nvme_print_device_info(ctrl);
 		return ret;
 	}
 
 	mutex_lock(&ctrl->subsys->lock);
-	head = nvme_find_ns_head(ctrl, nsid);
+	head = nvme_find_ns_head(ctrl, info->nsid);
 	if (!head) {
-		ret = nvme_subsys_check_duplicate_ids(ctrl->subsys, ids);
+		ret = nvme_subsys_check_duplicate_ids(ctrl->subsys, &info->ids);
 		if (ret) {
 			dev_err(ctrl->device,
 				"duplicate IDs in subsystem for nsid %d\n",
-				nsid);
+				info->nsid);
 			goto out_unlock;
 		}
-		head = nvme_alloc_ns_head(ctrl, nsid, ids, is_shared);
+		head = nvme_alloc_ns_head(ctrl, info);
 		if (IS_ERR(head)) {
 			ret = PTR_ERR(head);
 			goto out_unlock;
 		}
 	} else {
 		ret = -EINVAL;
-		if (!is_shared || !head->shared) {
+		if (!info->is_shared || !head->shared) {
 			dev_err(ctrl->device,
-				"Duplicate unshared namespace %d\n", nsid);
+				"Duplicate unshared namespace %d\n",
+				info->nsid);
 			goto out_put_ns_head;
 		}
-		if (!nvme_ns_ids_equal(&head->ids, ids)) {
+		if (!nvme_ns_ids_equal(&head->ids, &info->ids)) {
 			dev_err(ctrl->device,
 				"IDs don't match for shared namespace %d\n",
-					nsid);
+					info->nsid);
 			goto out_put_ns_head;
 		}
 
 		if (!multipath && !list_empty(&head->list)) {
 			dev_warn(ctrl->device,
 				"Found shared namespace %d, but multipathing not supported.\n",
-				nsid);
+				info->nsid);
 			dev_warn_once(ctrl->device,
 				"Support for shared namespaces without CONFIG_NVME_MULTIPATH is deprecated and will be removed in Linux 6.0\n.");
 		}
@@ -3968,20 +4175,15 @@ static void nvme_ns_add_to_ctrl_list(struct nvme_ns *ns)
 	list_add(&ns->list, &ns->ctrl->namespaces);
 }
 
-static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid,
-		struct nvme_ns_ids *ids)
+static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
 {
 	struct nvme_ns *ns;
 	struct gendisk *disk;
-	struct nvme_id_ns *id;
 	int node = ctrl->numa_node;
 
-	if (nvme_identify_ns(ctrl, nsid, ids, &id))
-		return;
-
 	ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
 	if (!ns)
-		goto out_free_id;
+		return;
 
 	disk = blk_mq_alloc_disk(ctrl->tagset, ns);
 	if (IS_ERR(disk))
@@ -4002,7 +4204,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid,
 	ns->ctrl = ctrl;
 	kref_init(&ns->kref);
 
-	if (nvme_init_ns_head(ns, nsid, ids, id->nmic & NVME_NS_NMIC_SHARED))
+	if (nvme_init_ns_head(ns, info))
 		goto out_cleanup_disk;
 
 	/*
@@ -4028,7 +4230,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid,
 			ns->head->instance);
 	}
 
-	if (nvme_update_ns_info(ns, id))
+	if (nvme_update_ns_info(ns, info))
 		goto out_unlink_ns;
 
 	down_write(&ctrl->namespaces_rwsem);
@@ -4042,9 +4244,8 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid,
 	if (!nvme_ns_head_multipath(ns->head))
 		nvme_add_ns_cdev(ns);
 
-	nvme_mpath_add_disk(ns, id);
+	nvme_mpath_add_disk(ns, info->anagrpid);
 	nvme_fault_inject_init(&ns->fault_inject, ns->disk->disk_name);
-	kfree(id);
 
 	return;
 
@@ -4064,8 +4265,6 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid,
 	put_disk(disk);
  out_free_ns:
 	kfree(ns);
- out_free_id:
-	kfree(id);
 }
 
 static void nvme_ns_remove(struct nvme_ns *ns)
@@ -4123,29 +4322,21 @@ static void nvme_ns_remove_by_nsid(struct nvme_ctrl *ctrl, u32 nsid)
 	}
 }
 
-static void nvme_validate_ns(struct nvme_ns *ns, struct nvme_ns_ids *ids)
+static void nvme_validate_ns(struct nvme_ns *ns, struct nvme_ns_info *info)
 {
-	struct nvme_id_ns *id;
 	int ret = NVME_SC_INVALID_NS | NVME_SC_DNR;
 
 	if (test_bit(NVME_NS_DEAD, &ns->flags))
 		goto out;
 
-	ret = nvme_identify_ns(ns->ctrl, ns->head->ns_id, ids, &id);
-	if (ret)
-		goto out;
-
 	ret = NVME_SC_INVALID_NS | NVME_SC_DNR;
-	if (!nvme_ns_ids_equal(&ns->head->ids, ids)) {
+	if (!nvme_ns_ids_equal(&ns->head->ids, &info->ids)) {
 		dev_err(ns->ctrl->device,
 			"identifiers changed for nsid %d\n", ns->head->ns_id);
-		goto out_free_id;
+		goto out;
 	}
 
-	ret = nvme_update_ns_info(ns, id);
-
-out_free_id:
-	kfree(id);
+	ret = nvme_update_ns_info(ns, info);
 out:
 	/*
 	 * Only remove the namespace if we got a fatal error back from the
@@ -4157,59 +4348,47 @@ out:
 		nvme_ns_remove(ns);
 }
 
-static void nvme_validate_or_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
+static void nvme_scan_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 {
-	struct nvme_ns_ids ids = { };
-	struct nvme_id_ns_cs_indep *id;
+	struct nvme_ns_info info = { .nsid = nsid };
 	struct nvme_ns *ns;
-	bool ready = true;
 
-	if (nvme_identify_ns_descs(ctrl, nsid, &ids))
+	if (nvme_identify_ns_descs(ctrl, &info))
 		return;
 
-	/*
-	 * Check if the namespace is ready.  If not ignore it, we will get an
-	 * AEN once it becomes ready and restart the scan.
-	 */
-	if ((ctrl->cap & NVME_CAP_CRMS_CRIMS) &&
-	    !nvme_identify_ns_cs_indep(ctrl, nsid, &id)) {
-		ready = id->nstat & NVME_NSTAT_NRDY;
-		kfree(id);
+	if (info.ids.csi != NVME_CSI_NVM && !nvme_multi_css(ctrl)) {
+		dev_warn(ctrl->device,
+			"command set not reported for nsid: %d\n", nsid);
+		return;
 	}
 
-	if (!ready)
+	/*
+	 * If available try to use the Command Set Idependent Identify Namespace
+	 * data structure to find all the generic information that is needed to
+	 * set up a namespace.  If not fall back to the legacy version.
+	 */
+	if ((ctrl->cap & NVME_CAP_CRMS_CRIMS) ||
+	    (info.ids.csi != NVME_CSI_NVM && info.ids.csi != NVME_CSI_ZNS)) {
+		if (nvme_ns_info_from_id_cs_indep(ctrl, &info))
+			return;
+	} else {
+		if (nvme_ns_info_from_identify(ctrl, &info))
+			return;
+	}
+
+	/*
+	 * Ignore the namespace if it is not ready. We will get an AEN once it
+	 * becomes ready and restart the scan.
+	 */
+	if (!info.is_ready)
 		return;
 
 	ns = nvme_find_get_ns(ctrl, nsid);
 	if (ns) {
-		nvme_validate_ns(ns, &ids);
+		nvme_validate_ns(ns, &info);
 		nvme_put_ns(ns);
-		return;
-	}
-
-	switch (ids.csi) {
-	case NVME_CSI_NVM:
-		nvme_alloc_ns(ctrl, nsid, &ids);
-		break;
-	case NVME_CSI_ZNS:
-		if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED)) {
-			dev_warn(ctrl->device,
-				"nsid %u not supported without CONFIG_BLK_DEV_ZONED\n",
-				nsid);
-			break;
-		}
-		if (!nvme_multi_css(ctrl)) {
-			dev_warn(ctrl->device,
-				"command set not reported for nsid: %d\n",
-				nsid);
-			break;
-		}
-		nvme_alloc_ns(ctrl, nsid, &ids);
-		break;
-	default:
-		dev_warn(ctrl->device, "unknown csi %u for nsid %u\n",
-			ids.csi, nsid);
-		break;
+	} else {
+		nvme_alloc_ns(ctrl, &info);
 	}
 }
 
@@ -4265,7 +4444,7 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
 
 			if (!nsid)	/* end of the list? */
 				goto out;
-			nvme_validate_or_alloc_ns(ctrl, nsid);
+			nvme_scan_ns(ctrl, nsid);
 			while (++prev < nsid)
 				nvme_ns_remove_by_nsid(ctrl, prev);
 		}
@@ -4288,7 +4467,7 @@ static void nvme_scan_ns_sequential(struct nvme_ctrl *ctrl)
 	kfree(id);
 
 	for (i = 1; i <= nn; i++)
-		nvme_validate_or_alloc_ns(ctrl, i);
+		nvme_scan_ns(ctrl, i);
 
 	nvme_remove_invalid_namespaces(ctrl, nn);
 }
@@ -4525,9 +4704,19 @@ static void nvme_fw_act_work(struct work_struct *work)
 	nvme_get_fw_slot_info(ctrl);
 }
 
+static u32 nvme_aer_type(u32 result)
+{
+	return result & 0x7;
+}
+
+static u32 nvme_aer_subtype(u32 result)
+{
+	return (result & 0xff00) >> 8;
+}
+
 static void nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result)
 {
-	u32 aer_notice_type = (result & 0xff00) >> 8;
+	u32 aer_notice_type = nvme_aer_subtype(result);
 
 	trace_nvme_async_event(ctrl, aer_notice_type);
 
@@ -4542,8 +4731,10 @@ static void nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result)
 		 * recovery actions from interfering with the controller's
 		 * firmware activation.
 		 */
-		if (nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
+		if (nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) {
+			nvme_auth_stop(ctrl);
 			queue_work(nvme_wq, &ctrl->fw_act_work);
+		}
 		break;
 #ifdef CONFIG_NVME_MULTIPATH
 	case NVME_AER_NOTICE_ANA:
@@ -4560,11 +4751,19 @@ static void nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result)
 	}
 }
 
+static void nvme_handle_aer_persistent_error(struct nvme_ctrl *ctrl)
+{
+	trace_nvme_async_event(ctrl, NVME_AER_ERROR);
+	dev_warn(ctrl->device, "resetting controller due to AER\n");
+	nvme_reset_ctrl(ctrl);
+}
+
 void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
 		volatile union nvme_result *res)
 {
 	u32 result = le32_to_cpu(res->u32);
-	u32 aer_type = result & 0x07;
+	u32 aer_type = nvme_aer_type(result);
+	u32 aer_subtype = nvme_aer_subtype(result);
 
 	if (le16_to_cpu(status) >> 1 != NVME_SC_SUCCESS)
 		return;
@@ -4574,6 +4773,15 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
 		nvme_handle_aen_notice(ctrl, result);
 		break;
 	case NVME_AER_ERROR:
+		/*
+		 * For a persistent internal error, don't run async_event_work
+		 * to submit a new AER. The controller reset will do it.
+		 */
+		if (aer_subtype == NVME_AER_ERROR_PERSIST_INT_ERR) {
+			nvme_handle_aer_persistent_error(ctrl);
+			return;
+		}
+		fallthrough;
 	case NVME_AER_SMART:
 	case NVME_AER_CSS:
 	case NVME_AER_VS:
@@ -4590,6 +4798,7 @@ EXPORT_SYMBOL_GPL(nvme_complete_async_event);
 void nvme_stop_ctrl(struct nvme_ctrl *ctrl)
 {
 	nvme_mpath_stop(ctrl);
+	nvme_auth_stop(ctrl);
 	nvme_stop_keep_alive(ctrl);
 	nvme_stop_failfast_work(ctrl);
 	flush_work(&ctrl->async_event_work);
@@ -4649,6 +4858,8 @@ static void nvme_free_ctrl(struct device *dev)
 
 	nvme_free_cels(ctrl);
 	nvme_mpath_uninit(ctrl);
+	nvme_auth_stop(ctrl);
+	nvme_auth_free(ctrl);
 	__free_page(ctrl->discard_page);
 
 	if (subsys) {
@@ -4739,6 +4950,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
 
 	nvme_fault_inject_init(&ctrl->fault_inject, dev_name(ctrl->device));
 	nvme_mpath_init_ctrl(ctrl);
+	nvme_auth_init_ctrl(ctrl);
 
 	return 0;
 out_free_name:
diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index ee79a6d639b4..5207a2348257 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -152,7 +152,7 @@ int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val)
 	cmd.prop_get.fctype = nvme_fabrics_type_property_get;
 	cmd.prop_get.offset = cpu_to_le32(off);
 
-	ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res, NULL, 0, 0,
+	ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res, NULL, 0,
 			NVME_QID_ANY, 0, 0);
 
 	if (ret >= 0)
@@ -198,7 +198,7 @@ int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val)
 	cmd.prop_get.attrib = 1;
 	cmd.prop_get.offset = cpu_to_le32(off);
 
-	ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res, NULL, 0, 0,
+	ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res, NULL, 0,
 			NVME_QID_ANY, 0, 0);
 
 	if (ret >= 0)
@@ -243,7 +243,7 @@ int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val)
 	cmd.prop_set.offset = cpu_to_le32(off);
 	cmd.prop_set.value = cpu_to_le64(val);
 
-	ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, NULL, NULL, 0, 0,
+	ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, NULL, NULL, 0,
 			NVME_QID_ANY, 0, 0);
 	if (unlikely(ret))
 		dev_err(ctrl->device,
@@ -331,6 +331,10 @@ static void nvmf_log_connect_error(struct nvme_ctrl *ctrl,
 		dev_err(ctrl->device,
 			"Connect command failed: host path error\n");
 		break;
+	case NVME_SC_AUTH_REQUIRED:
+		dev_err(ctrl->device,
+			"Connect command failed: authentication required\n");
+		break;
 	default:
 		dev_err(ctrl->device,
 			"Connect command failed, error wo/DNR bit: %d\n",
@@ -365,6 +369,7 @@ int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl)
 	union nvme_result res;
 	struct nvmf_connect_data *data;
 	int ret;
+	u32 result;
 
 	cmd.connect.opcode = nvme_fabrics_command;
 	cmd.connect.fctype = nvme_fabrics_type_connect;
@@ -389,7 +394,7 @@ int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl)
 	strncpy(data->hostnqn, ctrl->opts->host->nqn, NVMF_NQN_SIZE);
 
 	ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res,
-			data, sizeof(*data), 0, NVME_QID_ANY, 1,
+			data, sizeof(*data), NVME_QID_ANY, 1,
 			BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT);
 	if (ret) {
 		nvmf_log_connect_error(ctrl, ret, le32_to_cpu(res.u32),
@@ -397,8 +402,25 @@ int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl)
 		goto out_free_data;
 	}
 
-	ctrl->cntlid = le16_to_cpu(res.u16);
-
+	result = le32_to_cpu(res.u32);
+	ctrl->cntlid = result & 0xFFFF;
+	if ((result >> 16) & 0x3) {
+		/* Authentication required */
+		ret = nvme_auth_negotiate(ctrl, 0);
+		if (ret) {
+			dev_warn(ctrl->device,
+				 "qid 0: authentication setup failed\n");
+			ret = NVME_SC_AUTH_REQUIRED;
+			goto out_free_data;
+		}
+		ret = nvme_auth_wait(ctrl, 0);
+		if (ret)
+			dev_warn(ctrl->device,
+				 "qid 0: authentication failed\n");
+		else
+			dev_info(ctrl->device,
+				 "qid 0: authenticated\n");
+	}
 out_free_data:
 	kfree(data);
 	return ret;
@@ -431,6 +453,7 @@ int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid)
 	struct nvmf_connect_data *data;
 	union nvme_result res;
 	int ret;
+	u32 result;
 
 	cmd.connect.opcode = nvme_fabrics_command;
 	cmd.connect.fctype = nvme_fabrics_type_connect;
@@ -450,12 +473,27 @@ int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid)
 	strncpy(data->hostnqn, ctrl->opts->host->nqn, NVMF_NQN_SIZE);
 
 	ret = __nvme_submit_sync_cmd(ctrl->connect_q, &cmd, &res,
-			data, sizeof(*data), 0, qid, 1,
+			data, sizeof(*data), qid, 1,
 			BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT);
 	if (ret) {
 		nvmf_log_connect_error(ctrl, ret, le32_to_cpu(res.u32),
 				       &cmd, data);
 	}
+	result = le32_to_cpu(res.u32);
+	if ((result >> 16) & 2) {
+		/* Authentication required */
+		ret = nvme_auth_negotiate(ctrl, qid);
+		if (ret) {
+			dev_warn(ctrl->device,
+				 "qid %d: authentication setup failed\n", qid);
+			ret = NVME_SC_AUTH_REQUIRED;
+		} else {
+			ret = nvme_auth_wait(ctrl, qid);
+			if (ret)
+				dev_warn(ctrl->device,
+					 "qid %u: authentication failed\n", qid);
+		}
+	}
 	kfree(data);
 	return ret;
 }
@@ -548,6 +586,8 @@ static const match_table_t opt_tokens = {
 	{ NVMF_OPT_TOS,			"tos=%d"		},
 	{ NVMF_OPT_FAIL_FAST_TMO,	"fast_io_fail_tmo=%d"	},
 	{ NVMF_OPT_DISCOVERY,		"discovery"		},
+	{ NVMF_OPT_DHCHAP_SECRET,	"dhchap_secret=%s"	},
+	{ NVMF_OPT_DHCHAP_CTRL_SECRET,	"dhchap_ctrl_secret=%s"	},
 	{ NVMF_OPT_ERR,			NULL			}
 };
 
@@ -829,6 +869,34 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
 		case NVMF_OPT_DISCOVERY:
 			opts->discovery_nqn = true;
 			break;
+		case NVMF_OPT_DHCHAP_SECRET:
+			p = match_strdup(args);
+			if (!p) {
+				ret = -ENOMEM;
+				goto out;
+			}
+			if (strlen(p) < 11 || strncmp(p, "DHHC-1:", 7)) {
+				pr_err("Invalid DH-CHAP secret %s\n", p);
+				ret = -EINVAL;
+				goto out;
+			}
+			kfree(opts->dhchap_secret);
+			opts->dhchap_secret = p;
+			break;
+		case NVMF_OPT_DHCHAP_CTRL_SECRET:
+			p = match_strdup(args);
+			if (!p) {
+				ret = -ENOMEM;
+				goto out;
+			}
+			if (strlen(p) < 11 || strncmp(p, "DHHC-1:", 7)) {
+				pr_err("Invalid DH-CHAP secret %s\n", p);
+				ret = -EINVAL;
+				goto out;
+			}
+			kfree(opts->dhchap_ctrl_secret);
+			opts->dhchap_ctrl_secret = p;
+			break;
 		default:
 			pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n",
 				p);
@@ -947,6 +1015,8 @@ void nvmf_free_options(struct nvmf_ctrl_options *opts)
 	kfree(opts->subsysnqn);
 	kfree(opts->host_traddr);
 	kfree(opts->host_iface);
+	kfree(opts->dhchap_secret);
+	kfree(opts->dhchap_ctrl_secret);
 	kfree(opts);
 }
 EXPORT_SYMBOL_GPL(nvmf_free_options);
@@ -956,7 +1026,8 @@ EXPORT_SYMBOL_GPL(nvmf_free_options);
 				 NVMF_OPT_KATO | NVMF_OPT_HOSTNQN | \
 				 NVMF_OPT_HOST_ID | NVMF_OPT_DUP_CONNECT |\
 				 NVMF_OPT_DISABLE_SQFLOW | NVMF_OPT_DISCOVERY |\
-				 NVMF_OPT_FAIL_FAST_TMO)
+				 NVMF_OPT_FAIL_FAST_TMO | NVMF_OPT_DHCHAP_SECRET |\
+				 NVMF_OPT_DHCHAP_CTRL_SECRET)
 
 static struct nvme_ctrl *
 nvmf_create_ctrl(struct device *dev, const char *buf)
@@ -1192,7 +1263,14 @@ static void __exit nvmf_exit(void)
 	BUILD_BUG_ON(sizeof(struct nvmf_connect_command) != 64);
 	BUILD_BUG_ON(sizeof(struct nvmf_property_get_command) != 64);
 	BUILD_BUG_ON(sizeof(struct nvmf_property_set_command) != 64);
+	BUILD_BUG_ON(sizeof(struct nvmf_auth_send_command) != 64);
+	BUILD_BUG_ON(sizeof(struct nvmf_auth_receive_command) != 64);
 	BUILD_BUG_ON(sizeof(struct nvmf_connect_data) != 1024);
+	BUILD_BUG_ON(sizeof(struct nvmf_auth_dhchap_negotiate_data) != 8);
+	BUILD_BUG_ON(sizeof(struct nvmf_auth_dhchap_challenge_data) != 16);
+	BUILD_BUG_ON(sizeof(struct nvmf_auth_dhchap_reply_data) != 16);
+	BUILD_BUG_ON(sizeof(struct nvmf_auth_dhchap_success1_data) != 16);
+	BUILD_BUG_ON(sizeof(struct nvmf_auth_dhchap_success2_data) != 16);
 }
 
 MODULE_LICENSE("GPL v2");
diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h
index 46d6e194ac2b..a6e22116e139 100644
--- a/drivers/nvme/host/fabrics.h
+++ b/drivers/nvme/host/fabrics.h
@@ -68,6 +68,8 @@ enum {
 	NVMF_OPT_FAIL_FAST_TMO	= 1 << 20,
 	NVMF_OPT_HOST_IFACE	= 1 << 21,
 	NVMF_OPT_DISCOVERY	= 1 << 22,
+	NVMF_OPT_DHCHAP_SECRET	= 1 << 23,
+	NVMF_OPT_DHCHAP_CTRL_SECRET = 1 << 24,
 };
 
 /**
@@ -97,6 +99,9 @@ enum {
  * @max_reconnects: maximum number of allowed reconnect attempts before removing
  *              the controller, (-1) means reconnect forever, zero means remove
  *              immediately;
+ * @dhchap_secret: DH-HMAC-CHAP secret
+ * @dhchap_ctrl_secret: DH-HMAC-CHAP controller secret for bi-directional
+ *              authentication
  * @disable_sqflow: disable controller sq flow control
  * @hdr_digest: generate/verify header digest (TCP)
  * @data_digest: generate/verify data digest (TCP)
@@ -121,6 +126,8 @@ struct nvmf_ctrl_options {
 	unsigned int		kato;
 	struct nvmf_host	*host;
 	int			max_reconnects;
+	char			*dhchap_secret;
+	char			*dhchap_ctrl_secret;
 	bool			disable_sqflow;
 	bool			hdr_digest;
 	bool			data_digest;
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index f26640ccb955..6ef497c75a16 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -346,7 +346,7 @@ static void nvme_ns_head_submit_bio(struct bio *bio)
 	 * different queue via blk_steal_bios(), so we need to use the bio_split
 	 * pool from the original queue to allocate the bvecs from.
 	 */
-	blk_queue_split(&bio);
+	bio = bio_split_to_limits(bio);
 
 	srcu_idx = srcu_read_lock(&head->srcu);
 	ns = nvme_find_path(head);
@@ -408,6 +408,7 @@ const struct block_device_operations nvme_ns_head_ops = {
 	.open		= nvme_ns_head_open,
 	.release	= nvme_ns_head_release,
 	.ioctl		= nvme_ns_head_ioctl,
+	.compat_ioctl	= blkdev_compat_ptr_ioctl,
 	.getgeo		= nvme_getgeo,
 	.report_zones	= nvme_ns_head_report_zones,
 	.pr_ops		= &nvme_pr_ops,
@@ -800,16 +801,16 @@ static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl,
 	return -ENXIO; /* just break out of the loop */
 }
 
-void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id)
+void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid)
 {
 	if (nvme_ctrl_use_ana(ns->ctrl)) {
 		struct nvme_ana_group_desc desc = {
-			.grpid = id->anagrpid,
+			.grpid = anagrpid,
 			.state = 0,
 		};
 
 		mutex_lock(&ns->ctrl->ana_lock);
-		ns->ana_grpid = le32_to_cpu(id->anagrpid);
+		ns->ana_grpid = le32_to_cpu(anagrpid);
 		nvme_parse_ana_log(ns->ctrl, &desc, nvme_lookup_ana_group_desc);
 		mutex_unlock(&ns->ctrl->ana_lock);
 		if (desc.state) {
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 7e0a925bf3be..bdc0ff7ed9ab 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -140,7 +140,7 @@ enum nvme_quirks {
 	NVME_QUIRK_DMA_ADDRESS_BITS_48		= (1 << 16),
 
 	/*
-	 * The controller requires the command_id value be be limited, so skip
+	 * The controller requires the command_id value be limited, so skip
 	 * encoding the generation sequence number.
 	 */
 	NVME_QUIRK_SKIP_CID_GEN			= (1 << 17),
@@ -328,6 +328,15 @@ struct nvme_ctrl {
 	struct work_struct ana_work;
 #endif
 
+#ifdef CONFIG_NVME_AUTH
+	struct work_struct dhchap_auth_work;
+	struct list_head dhchap_auth_list;
+	struct mutex dhchap_auth_mutex;
+	struct nvme_dhchap_key *host_key;
+	struct nvme_dhchap_key *ctrl_key;
+	u16 transaction;
+#endif
+
 	/* Power saving configuration */
 	u64 ps_max_latency_us;
 	bool apst_enabled;
@@ -781,7 +790,7 @@ int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
 		void *buf, unsigned bufflen);
 int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
 		union nvme_result *result, void *buffer, unsigned bufflen,
-		unsigned timeout, int qid, int at_head,
+		int qid, int at_head,
 		blk_mq_req_flags_t flags);
 int nvme_set_features(struct nvme_ctrl *dev, unsigned int fid,
 		      unsigned int dword11, void *buffer, size_t buflen,
@@ -837,7 +846,7 @@ void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys);
 void nvme_failover_req(struct request *req);
 void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl);
 int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head);
-void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id);
+void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid);
 void nvme_mpath_remove_disk(struct nvme_ns_head *head);
 int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id);
 void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl);
@@ -879,8 +888,7 @@ static inline int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,
 {
 	return 0;
 }
-static inline void nvme_mpath_add_disk(struct nvme_ns *ns,
-		struct nvme_id_ns *id)
+static inline void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid)
 {
 }
 static inline void nvme_mpath_remove_disk(struct nvme_ns_head *head)
@@ -992,6 +1000,27 @@ static inline bool nvme_ctrl_sgl_supported(struct nvme_ctrl *ctrl)
 	return ctrl->sgls & ((1 << 0) | (1 << 1));
 }
 
+#ifdef CONFIG_NVME_AUTH
+void nvme_auth_init_ctrl(struct nvme_ctrl *ctrl);
+void nvme_auth_stop(struct nvme_ctrl *ctrl);
+int nvme_auth_negotiate(struct nvme_ctrl *ctrl, int qid);
+int nvme_auth_wait(struct nvme_ctrl *ctrl, int qid);
+void nvme_auth_reset(struct nvme_ctrl *ctrl);
+void nvme_auth_free(struct nvme_ctrl *ctrl);
+#else
+static inline void nvme_auth_init_ctrl(struct nvme_ctrl *ctrl) {};
+static inline void nvme_auth_stop(struct nvme_ctrl *ctrl) {};
+static inline int nvme_auth_negotiate(struct nvme_ctrl *ctrl, int qid)
+{
+	return -EPROTONOSUPPORT;
+}
+static inline int nvme_auth_wait(struct nvme_ctrl *ctrl, int qid)
+{
+	return NVME_SC_AUTH_REQUIRED;
+}
+static inline void nvme_auth_free(struct nvme_ctrl *ctrl) {};
+#endif
+
 u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 			 u8 opcode);
 int nvme_execute_passthru_rq(struct request *rq);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 7e7d4802ac6b..71a4f26ba476 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -670,7 +670,6 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
 
 	prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
 	if (!prp_list) {
-		iod->first_dma = dma_addr;
 		iod->npages = -1;
 		return BLK_STS_RESOURCE;
 	}
@@ -1435,8 +1434,10 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req)
 	cmd.abort.sqid = cpu_to_le16(nvmeq->qid);
 
 	dev_warn(nvmeq->dev->ctrl.device,
-		"I/O %d QID %d timeout, aborting\n",
-		 req->tag, nvmeq->qid);
+		"I/O %d (%s) QID %d timeout, aborting\n",
+		 req->tag,
+		 nvme_get_opcode_str(nvme_req(req)->cmd->common.opcode),
+		 nvmeq->qid);
 
 	abort_req = blk_mq_alloc_request(dev->ctrl.admin_q, nvme_req_op(&cmd),
 					 BLK_MQ_REQ_NOWAIT);
@@ -1765,37 +1766,35 @@ static void nvme_dev_remove_admin(struct nvme_dev *dev)
 	}
 }
 
-static int nvme_alloc_admin_tags(struct nvme_dev *dev)
+static int nvme_pci_alloc_admin_tag_set(struct nvme_dev *dev)
 {
-	if (!dev->ctrl.admin_q) {
-		dev->admin_tagset.ops = &nvme_mq_admin_ops;
-		dev->admin_tagset.nr_hw_queues = 1;
+	struct blk_mq_tag_set *set = &dev->admin_tagset;
 
-		dev->admin_tagset.queue_depth = NVME_AQ_MQ_TAG_DEPTH;
-		dev->admin_tagset.timeout = NVME_ADMIN_TIMEOUT;
-		dev->admin_tagset.numa_node = dev->ctrl.numa_node;
-		dev->admin_tagset.cmd_size = sizeof(struct nvme_iod);
-		dev->admin_tagset.flags = BLK_MQ_F_NO_SCHED;
-		dev->admin_tagset.driver_data = dev;
+	set->ops = &nvme_mq_admin_ops;
+	set->nr_hw_queues = 1;
 
-		if (blk_mq_alloc_tag_set(&dev->admin_tagset))
-			return -ENOMEM;
-		dev->ctrl.admin_tagset = &dev->admin_tagset;
+	set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
+	set->timeout = NVME_ADMIN_TIMEOUT;
+	set->numa_node = dev->ctrl.numa_node;
+	set->cmd_size = sizeof(struct nvme_iod);
+	set->flags = BLK_MQ_F_NO_SCHED;
+	set->driver_data = dev;
 
-		dev->ctrl.admin_q = blk_mq_init_queue(&dev->admin_tagset);
-		if (IS_ERR(dev->ctrl.admin_q)) {
-			blk_mq_free_tag_set(&dev->admin_tagset);
-			dev->ctrl.admin_q = NULL;
-			return -ENOMEM;
-		}
-		if (!blk_get_queue(dev->ctrl.admin_q)) {
-			nvme_dev_remove_admin(dev);
-			dev->ctrl.admin_q = NULL;
-			return -ENODEV;
-		}
-	} else
-		nvme_start_admin_queue(&dev->ctrl);
+	if (blk_mq_alloc_tag_set(set))
+		return -ENOMEM;
+	dev->ctrl.admin_tagset = set;
 
+	dev->ctrl.admin_q = blk_mq_init_queue(set);
+	if (IS_ERR(dev->ctrl.admin_q)) {
+		blk_mq_free_tag_set(set);
+		dev->ctrl.admin_q = NULL;
+		return -ENOMEM;
+	}
+	if (!blk_get_queue(dev->ctrl.admin_q)) {
+		nvme_dev_remove_admin(dev);
+		dev->ctrl.admin_q = NULL;
+		return -ENODEV;
+	}
 	return 0;
 }
 
@@ -2534,47 +2533,45 @@ static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode)
 	return true;
 }
 
-static void nvme_dev_add(struct nvme_dev *dev)
+static void nvme_pci_alloc_tag_set(struct nvme_dev *dev)
 {
+	struct blk_mq_tag_set * set = &dev->tagset;
 	int ret;
 
-	if (!dev->ctrl.tagset) {
-		dev->tagset.ops = &nvme_mq_ops;
-		dev->tagset.nr_hw_queues = dev->online_queues - 1;
-		dev->tagset.nr_maps = 2; /* default + read */
-		if (dev->io_queues[HCTX_TYPE_POLL])
-			dev->tagset.nr_maps++;
-		dev->tagset.timeout = NVME_IO_TIMEOUT;
-		dev->tagset.numa_node = dev->ctrl.numa_node;
-		dev->tagset.queue_depth = min_t(unsigned int, dev->q_depth,
-						BLK_MQ_MAX_DEPTH) - 1;
-		dev->tagset.cmd_size = sizeof(struct nvme_iod);
-		dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
-		dev->tagset.driver_data = dev;
+	set->ops = &nvme_mq_ops;
+	set->nr_hw_queues = dev->online_queues - 1;
+	set->nr_maps = 2; /* default + read */
+	if (dev->io_queues[HCTX_TYPE_POLL])
+		set->nr_maps++;
+	set->timeout = NVME_IO_TIMEOUT;
+	set->numa_node = dev->ctrl.numa_node;
+	set->queue_depth = min_t(unsigned, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1;
+	set->cmd_size = sizeof(struct nvme_iod);
+	set->flags = BLK_MQ_F_SHOULD_MERGE;
+	set->driver_data = dev;
 
-		/*
-		 * Some Apple controllers requires tags to be unique
-		 * across admin and IO queue, so reserve the first 32
-		 * tags of the IO queue.
-		 */
-		if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS)
-			dev->tagset.reserved_tags = NVME_AQ_DEPTH;
+	/*
+	 * Some Apple controllers requires tags to be unique
+	 * across admin and IO queue, so reserve the first 32
+	 * tags of the IO queue.
+	 */
+	if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS)
+		set->reserved_tags = NVME_AQ_DEPTH;
 
-		ret = blk_mq_alloc_tag_set(&dev->tagset);
-		if (ret) {
-			dev_warn(dev->ctrl.device,
-				"IO queues tagset allocation failed %d\n", ret);
-			return;
-		}
-		dev->ctrl.tagset = &dev->tagset;
-	} else {
-		blk_mq_update_nr_hw_queues(&dev->tagset, dev->online_queues - 1);
-
-		/* Free previously allocated queues that are no longer usable */
-		nvme_free_queues(dev, dev->online_queues);
+	ret = blk_mq_alloc_tag_set(set);
+	if (ret) {
+		dev_warn(dev->ctrl.device,
+			"IO queues tagset allocation failed %d\n", ret);
+		return;
 	}
+	dev->ctrl.tagset = set;
+}
 
-	nvme_dbbuf_set(dev);
+static void nvme_pci_update_nr_queues(struct nvme_dev *dev)
+{
+	blk_mq_update_nr_hw_queues(&dev->tagset, dev->online_queues - 1);
+	/* free previously allocated queues that are no longer usable */
+	nvme_free_queues(dev, dev->online_queues);
 }
 
 static int nvme_pci_enable(struct nvme_dev *dev)
@@ -2725,10 +2722,8 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
 	nvme_pci_disable(dev);
 	nvme_reap_pending_cqes(dev);
 
-	blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl);
-	blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_request, &dev->ctrl);
-	blk_mq_tagset_wait_completed_request(&dev->tagset);
-	blk_mq_tagset_wait_completed_request(&dev->admin_tagset);
+	nvme_cancel_tagset(&dev->ctrl);
+	nvme_cancel_admin_tagset(&dev->ctrl);
 
 	/*
 	 * The driver will not be starting up queues again if shutting down so
@@ -2842,9 +2837,13 @@ static void nvme_reset_work(struct work_struct *work)
 	if (result)
 		goto out_unlock;
 
-	result = nvme_alloc_admin_tags(dev);
-	if (result)
-		goto out_unlock;
+	if (!dev->ctrl.admin_q) {
+		result = nvme_pci_alloc_admin_tag_set(dev);
+		if (result)
+			goto out_unlock;
+	} else {
+		nvme_start_admin_queue(&dev->ctrl);
+	}
 
 	/*
 	 * Limit the max command size to prevent iod->sg allocations going
@@ -2923,7 +2922,11 @@ static void nvme_reset_work(struct work_struct *work)
 	} else {
 		nvme_start_queues(&dev->ctrl);
 		nvme_wait_freeze(&dev->ctrl);
-		nvme_dev_add(dev);
+		if (!dev->ctrl.tagset)
+			nvme_pci_alloc_tag_set(dev);
+		else
+			nvme_pci_update_nr_queues(dev);
+		nvme_dbbuf_set(dev);
 		nvme_unfreeze(&dev->ctrl);
 	}
 
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 4665aebd944d..3100643be299 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -29,7 +29,7 @@
 #include "fabrics.h"
 
 
-#define NVME_RDMA_CONNECT_TIMEOUT_MS	3000		/* 3 second */
+#define NVME_RDMA_CM_TIMEOUT_MS		3000		/* 3 second */
 
 #define NVME_RDMA_MAX_SEGMENTS		256
 
@@ -248,12 +248,9 @@ static int nvme_rdma_wait_for_cm(struct nvme_rdma_queue *queue)
 {
 	int ret;
 
-	ret = wait_for_completion_interruptible_timeout(&queue->cm_done,
-			msecs_to_jiffies(NVME_RDMA_CONNECT_TIMEOUT_MS) + 1);
-	if (ret < 0)
+	ret = wait_for_completion_interruptible(&queue->cm_done);
+	if (ret)
 		return ret;
-	if (ret == 0)
-		return -ETIMEDOUT;
 	WARN_ON_ONCE(queue->cm_error > 0);
 	return queue->cm_error;
 }
@@ -612,7 +609,7 @@ static int nvme_rdma_alloc_queue(struct nvme_rdma_ctrl *ctrl,
 	queue->cm_error = -ETIMEDOUT;
 	ret = rdma_resolve_addr(queue->cm_id, src_addr,
 			(struct sockaddr *)&ctrl->addr,
-			NVME_RDMA_CONNECT_TIMEOUT_MS);
+			NVME_RDMA_CM_TIMEOUT_MS);
 	if (ret) {
 		dev_info(ctrl->ctrl.device,
 			"rdma_resolve_addr failed (%d).\n", ret);
@@ -790,50 +787,54 @@ out_free_queues:
 	return ret;
 }
 
-static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl,
-		bool admin)
+static int nvme_rdma_alloc_admin_tag_set(struct nvme_ctrl *nctrl)
 {
 	struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
-	struct blk_mq_tag_set *set;
+	struct blk_mq_tag_set *set = &ctrl->admin_tag_set;
 	int ret;
 
-	if (admin) {
-		set = &ctrl->admin_tag_set;
-		memset(set, 0, sizeof(*set));
-		set->ops = &nvme_rdma_admin_mq_ops;
-		set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
-		set->reserved_tags = NVMF_RESERVED_TAGS;
-		set->numa_node = nctrl->numa_node;
-		set->cmd_size = sizeof(struct nvme_rdma_request) +
-				NVME_RDMA_DATA_SGL_SIZE;
-		set->driver_data = ctrl;
-		set->nr_hw_queues = 1;
-		set->timeout = NVME_ADMIN_TIMEOUT;
-		set->flags = BLK_MQ_F_NO_SCHED;
-	} else {
-		set = &ctrl->tag_set;
-		memset(set, 0, sizeof(*set));
-		set->ops = &nvme_rdma_mq_ops;
-		set->queue_depth = nctrl->sqsize + 1;
-		set->reserved_tags = NVMF_RESERVED_TAGS;
-		set->numa_node = nctrl->numa_node;
-		set->flags = BLK_MQ_F_SHOULD_MERGE;
-		set->cmd_size = sizeof(struct nvme_rdma_request) +
-				NVME_RDMA_DATA_SGL_SIZE;
-		if (nctrl->max_integrity_segments)
-			set->cmd_size += sizeof(struct nvme_rdma_sgl) +
-					 NVME_RDMA_METADATA_SGL_SIZE;
-		set->driver_data = ctrl;
-		set->nr_hw_queues = nctrl->queue_count - 1;
-		set->timeout = NVME_IO_TIMEOUT;
-		set->nr_maps = nctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2;
-	}
-
+	memset(set, 0, sizeof(*set));
+	set->ops = &nvme_rdma_admin_mq_ops;
+	set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
+	set->reserved_tags = NVMF_RESERVED_TAGS;
+	set->numa_node = nctrl->numa_node;
+	set->cmd_size = sizeof(struct nvme_rdma_request) +
+			NVME_RDMA_DATA_SGL_SIZE;
+	set->driver_data = ctrl;
+	set->nr_hw_queues = 1;
+	set->timeout = NVME_ADMIN_TIMEOUT;
+	set->flags = BLK_MQ_F_NO_SCHED;
 	ret = blk_mq_alloc_tag_set(set);
-	if (ret)
-		return ERR_PTR(ret);
+	if (!ret)
+		ctrl->ctrl.admin_tagset = set;
+	return ret;
+}
 
-	return set;
+static int nvme_rdma_alloc_tag_set(struct nvme_ctrl *nctrl)
+{
+	struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
+	struct blk_mq_tag_set *set = &ctrl->tag_set;
+	int ret;
+
+	memset(set, 0, sizeof(*set));
+	set->ops = &nvme_rdma_mq_ops;
+	set->queue_depth = nctrl->sqsize + 1;
+	set->reserved_tags = NVMF_RESERVED_TAGS;
+	set->numa_node = nctrl->numa_node;
+	set->flags = BLK_MQ_F_SHOULD_MERGE;
+	set->cmd_size = sizeof(struct nvme_rdma_request) +
+			NVME_RDMA_DATA_SGL_SIZE;
+	if (nctrl->max_integrity_segments)
+		set->cmd_size += sizeof(struct nvme_rdma_sgl) +
+				 NVME_RDMA_METADATA_SGL_SIZE;
+	set->driver_data = ctrl;
+	set->nr_hw_queues = nctrl->queue_count - 1;
+	set->timeout = NVME_IO_TIMEOUT;
+	set->nr_maps = nctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2;
+	ret = blk_mq_alloc_tag_set(set);
+	if (!ret)
+		ctrl->ctrl.tagset = set;
+	return ret;
 }
 
 static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl,
@@ -885,11 +886,9 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
 		goto out_free_queue;
 
 	if (new) {
-		ctrl->ctrl.admin_tagset = nvme_rdma_alloc_tagset(&ctrl->ctrl, true);
-		if (IS_ERR(ctrl->ctrl.admin_tagset)) {
-			error = PTR_ERR(ctrl->ctrl.admin_tagset);
+		error = nvme_rdma_alloc_admin_tag_set(&ctrl->ctrl);
+		if (error)
 			goto out_free_async_qe;
-		}
 
 		ctrl->ctrl.fabrics_q = blk_mq_init_queue(&ctrl->admin_tag_set);
 		if (IS_ERR(ctrl->ctrl.fabrics_q)) {
@@ -972,11 +971,9 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new)
 		return ret;
 
 	if (new) {
-		ctrl->ctrl.tagset = nvme_rdma_alloc_tagset(&ctrl->ctrl, false);
-		if (IS_ERR(ctrl->ctrl.tagset)) {
-			ret = PTR_ERR(ctrl->ctrl.tagset);
+		ret = nvme_rdma_alloc_tag_set(&ctrl->ctrl);
+		if (ret)
 			goto out_free_io_queues;
-		}
 
 		ret = nvme_ctrl_init_connect_q(&(ctrl->ctrl));
 		if (ret)
@@ -1205,6 +1202,7 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work)
 	struct nvme_rdma_ctrl *ctrl = container_of(work,
 			struct nvme_rdma_ctrl, err_work);
 
+	nvme_auth_stop(&ctrl->ctrl);
 	nvme_stop_keep_alive(&ctrl->ctrl);
 	flush_work(&ctrl->ctrl.async_event_work);
 	nvme_rdma_teardown_io_queues(ctrl, false);
@@ -1894,7 +1892,7 @@ static int nvme_rdma_addr_resolved(struct nvme_rdma_queue *queue)
 
 	if (ctrl->opts->tos >= 0)
 		rdma_set_service_type(queue->cm_id, ctrl->opts->tos);
-	ret = rdma_resolve_route(queue->cm_id, NVME_RDMA_CONNECT_TIMEOUT_MS);
+	ret = rdma_resolve_route(queue->cm_id, NVME_RDMA_CM_TIMEOUT_MS);
 	if (ret) {
 		dev_err(ctrl->device, "rdma_resolve_route failed (%d).\n",
 			queue->cm_error);
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index b95ee85053e3..e82dcfcda29b 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -209,9 +209,11 @@ static inline u8 nvme_tcp_ddgst_len(struct nvme_tcp_queue *queue)
 	return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0;
 }
 
-static inline size_t nvme_tcp_inline_data_size(struct nvme_tcp_queue *queue)
+static inline size_t nvme_tcp_inline_data_size(struct nvme_tcp_request *req)
 {
-	return queue->cmnd_capsule_len - sizeof(struct nvme_command);
+	if (nvme_is_fabrics(req->req.cmd))
+		return NVME_TCP_ADMIN_CCSZ;
+	return req->queue->cmnd_capsule_len - sizeof(struct nvme_command);
 }
 
 static inline bool nvme_tcp_async_req(struct nvme_tcp_request *req)
@@ -229,7 +231,7 @@ static inline bool nvme_tcp_has_inline_data(struct nvme_tcp_request *req)
 	rq = blk_mq_rq_from_pdu(req);
 
 	return rq_data_dir(rq) == WRITE && req->data_len &&
-		req->data_len <= nvme_tcp_inline_data_size(req->queue);
+		req->data_len <= nvme_tcp_inline_data_size(req);
 }
 
 static inline struct page *nvme_tcp_req_cur_page(struct nvme_tcp_request *req)
@@ -1685,45 +1687,49 @@ static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx)
 	return ret;
 }
 
-static struct blk_mq_tag_set *nvme_tcp_alloc_tagset(struct nvme_ctrl *nctrl,
-		bool admin)
+static int nvme_tcp_alloc_admin_tag_set(struct nvme_ctrl *nctrl)
 {
 	struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
-	struct blk_mq_tag_set *set;
+	struct blk_mq_tag_set *set = &ctrl->admin_tag_set;
 	int ret;
 
-	if (admin) {
-		set = &ctrl->admin_tag_set;
-		memset(set, 0, sizeof(*set));
-		set->ops = &nvme_tcp_admin_mq_ops;
-		set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
-		set->reserved_tags = NVMF_RESERVED_TAGS;
-		set->numa_node = nctrl->numa_node;
-		set->flags = BLK_MQ_F_BLOCKING;
-		set->cmd_size = sizeof(struct nvme_tcp_request);
-		set->driver_data = ctrl;
-		set->nr_hw_queues = 1;
-		set->timeout = NVME_ADMIN_TIMEOUT;
-	} else {
-		set = &ctrl->tag_set;
-		memset(set, 0, sizeof(*set));
-		set->ops = &nvme_tcp_mq_ops;
-		set->queue_depth = nctrl->sqsize + 1;
-		set->reserved_tags = NVMF_RESERVED_TAGS;
-		set->numa_node = nctrl->numa_node;
-		set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;
-		set->cmd_size = sizeof(struct nvme_tcp_request);
-		set->driver_data = ctrl;
-		set->nr_hw_queues = nctrl->queue_count - 1;
-		set->timeout = NVME_IO_TIMEOUT;
-		set->nr_maps = nctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2;
-	}
-
+	memset(set, 0, sizeof(*set));
+	set->ops = &nvme_tcp_admin_mq_ops;
+	set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
+	set->reserved_tags = NVMF_RESERVED_TAGS;
+	set->numa_node = nctrl->numa_node;
+	set->flags = BLK_MQ_F_BLOCKING;
+	set->cmd_size = sizeof(struct nvme_tcp_request);
+	set->driver_data = ctrl;
+	set->nr_hw_queues = 1;
+	set->timeout = NVME_ADMIN_TIMEOUT;
 	ret = blk_mq_alloc_tag_set(set);
-	if (ret)
-		return ERR_PTR(ret);
+	if (!ret)
+		nctrl->admin_tagset = set;
+	return ret;
+}
 
-	return set;
+static int nvme_tcp_alloc_tag_set(struct nvme_ctrl *nctrl)
+{
+	struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
+	struct blk_mq_tag_set *set = &ctrl->tag_set;
+	int ret;
+
+	memset(set, 0, sizeof(*set));
+	set->ops = &nvme_tcp_mq_ops;
+	set->queue_depth = nctrl->sqsize + 1;
+	set->reserved_tags = NVMF_RESERVED_TAGS;
+	set->numa_node = nctrl->numa_node;
+	set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;
+	set->cmd_size = sizeof(struct nvme_tcp_request);
+	set->driver_data = ctrl;
+	set->nr_hw_queues = nctrl->queue_count - 1;
+	set->timeout = NVME_IO_TIMEOUT;
+	set->nr_maps = nctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2;
+	ret = blk_mq_alloc_tag_set(set);
+	if (!ret)
+		nctrl->tagset = set;
+	return ret;
 }
 
 static void nvme_tcp_free_admin_queue(struct nvme_ctrl *ctrl)
@@ -1899,11 +1905,9 @@ static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new)
 		return ret;
 
 	if (new) {
-		ctrl->tagset = nvme_tcp_alloc_tagset(ctrl, false);
-		if (IS_ERR(ctrl->tagset)) {
-			ret = PTR_ERR(ctrl->tagset);
+		ret = nvme_tcp_alloc_tag_set(ctrl);
+		if (ret)
 			goto out_free_io_queues;
-		}
 
 		ret = nvme_ctrl_init_connect_q(ctrl);
 		if (ret)
@@ -1968,11 +1972,9 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new)
 		return error;
 
 	if (new) {
-		ctrl->admin_tagset = nvme_tcp_alloc_tagset(ctrl, true);
-		if (IS_ERR(ctrl->admin_tagset)) {
-			error = PTR_ERR(ctrl->admin_tagset);
+		error = nvme_tcp_alloc_admin_tag_set(ctrl);
+		if (error)
 			goto out_free_queue;
-		}
 
 		ctrl->fabrics_q = blk_mq_init_queue(ctrl->admin_tagset);
 		if (IS_ERR(ctrl->fabrics_q)) {
@@ -2173,6 +2175,7 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work)
 				struct nvme_tcp_ctrl, err_work);
 	struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
 
+	nvme_auth_stop(ctrl);
 	nvme_stop_keep_alive(ctrl);
 	flush_work(&ctrl->async_event_work);
 	nvme_tcp_teardown_io_queues(ctrl, false);
@@ -2371,7 +2374,7 @@ static blk_status_t nvme_tcp_map_data(struct nvme_tcp_queue *queue,
 	if (!blk_rq_nr_phys_segments(rq))
 		nvme_tcp_set_sg_null(c);
 	else if (rq_data_dir(rq) == WRITE &&
-	    req->data_len <= nvme_tcp_inline_data_size(queue))
+	    req->data_len <= nvme_tcp_inline_data_size(req))
 		nvme_tcp_set_sg_inline(queue, c, req->data_len);
 	else
 		nvme_tcp_set_sg_host_data(c, req->data_len);
@@ -2406,7 +2409,7 @@ static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns,
 		nvme_tcp_init_iter(req, rq_data_dir(rq));
 
 	if (rq_data_dir(rq) == WRITE &&
-	    req->data_len <= nvme_tcp_inline_data_size(queue))
+	    req->data_len <= nvme_tcp_inline_data_size(req))
 		req->pdu_len = req->data_len;
 
 	pdu->hdr.type = nvme_tcp_cmd;
diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/host/trace.c
index 2a89c5aa0790..1c36fcedea20 100644
--- a/drivers/nvme/host/trace.c
+++ b/drivers/nvme/host/trace.c
@@ -287,6 +287,34 @@ static const char *nvme_trace_fabrics_property_get(struct trace_seq *p, u8 *spc)
 	return ret;
 }
 
+static const char *nvme_trace_fabrics_auth_send(struct trace_seq *p, u8 *spc)
+{
+	const char *ret = trace_seq_buffer_ptr(p);
+	u8 spsp0 = spc[1];
+	u8 spsp1 = spc[2];
+	u8 secp = spc[3];
+	u32 tl = get_unaligned_le32(spc + 4);
+
+	trace_seq_printf(p, "spsp0=%02x, spsp1=%02x, secp=%02x, tl=%u",
+			 spsp0, spsp1, secp, tl);
+	trace_seq_putc(p, 0);
+	return ret;
+}
+
+static const char *nvme_trace_fabrics_auth_receive(struct trace_seq *p, u8 *spc)
+{
+	const char *ret = trace_seq_buffer_ptr(p);
+	u8 spsp0 = spc[1];
+	u8 spsp1 = spc[2];
+	u8 secp = spc[3];
+	u32 al = get_unaligned_le32(spc + 4);
+
+	trace_seq_printf(p, "spsp0=%02x, spsp1=%02x, secp=%02x, al=%u",
+			 spsp0, spsp1, secp, al);
+	trace_seq_putc(p, 0);
+	return ret;
+}
+
 static const char *nvme_trace_fabrics_common(struct trace_seq *p, u8 *spc)
 {
 	const char *ret = trace_seq_buffer_ptr(p);
@@ -306,6 +334,10 @@ const char *nvme_trace_parse_fabrics_cmd(struct trace_seq *p,
 		return nvme_trace_fabrics_connect(p, spc);
 	case nvme_fabrics_type_property_get:
 		return nvme_trace_fabrics_property_get(p, spc);
+	case nvme_fabrics_type_auth_send:
+		return nvme_trace_fabrics_auth_send(p, spc);
+	case nvme_fabrics_type_auth_receive:
+		return nvme_trace_fabrics_auth_receive(p, spc);
 	default:
 		return nvme_trace_fabrics_common(p, spc);
 	}
diff --git a/drivers/nvme/host/trace.h b/drivers/nvme/host/trace.h
index 37c7f4c89f92..6f0eaf6a1528 100644
--- a/drivers/nvme/host/trace.h
+++ b/drivers/nvme/host/trace.h
@@ -98,7 +98,7 @@ TRACE_EVENT(nvme_complete_rq,
 	    TP_fast_assign(
 		__entry->ctrl_id = nvme_req(req)->ctrl->instance;
 		__entry->qid = nvme_req_qid(req);
-		__entry->cid = req->tag;
+		__entry->cid = nvme_req(req)->cmd->common.command_id;
 		__entry->result = le64_to_cpu(nvme_req(req)->result.u64);
 		__entry->retries = nvme_req(req)->retries;
 		__entry->flags = nvme_req(req)->flags;
diff --git a/drivers/nvme/target/Kconfig b/drivers/nvme/target/Kconfig
index 973561c93888..79fc64035ee3 100644
--- a/drivers/nvme/target/Kconfig
+++ b/drivers/nvme/target/Kconfig
@@ -83,3 +83,18 @@ config NVME_TARGET_TCP
 	  devices over TCP.
 
 	  If unsure, say N.
+
+config NVME_TARGET_AUTH
+	bool "NVMe over Fabrics In-band Authentication support"
+	depends on NVME_TARGET
+	select NVME_COMMON
+	select CRYPTO
+	select CRYPTO_HMAC
+	select CRYPTO_SHA256
+	select CRYPTO_SHA512
+	select CRYPTO_DH
+	select CRYPTO_DH_RFC7919_GROUPS
+	help
+	  This enables support for NVMe over Fabrics In-band Authentication
+
+	  If unsure, say N.
diff --git a/drivers/nvme/target/Makefile b/drivers/nvme/target/Makefile
index 9837e580fa7e..c66820102493 100644
--- a/drivers/nvme/target/Makefile
+++ b/drivers/nvme/target/Makefile
@@ -13,6 +13,7 @@ nvmet-y		+= core.o configfs.o admin-cmd.o fabrics-cmd.o \
 			discovery.o io-cmd-file.o io-cmd-bdev.o
 nvmet-$(CONFIG_NVME_TARGET_PASSTHRU)	+= passthru.o
 nvmet-$(CONFIG_BLK_DEV_ZONED)		+= zns.o
+nvmet-$(CONFIG_NVME_TARGET_AUTH)	+= fabrics-cmd-auth.o auth.o
 nvme-loop-y	+= loop.o
 nvmet-rdma-y	+= rdma.o
 nvmet-fc-y	+= fc.o
diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index 397daaf51f1b..fc8a957fad0a 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -1017,7 +1017,9 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req)
 	u16 ret;
 
 	if (nvme_is_fabrics(cmd))
-		return nvmet_parse_fabrics_cmd(req);
+		return nvmet_parse_fabrics_admin_cmd(req);
+	if (unlikely(!nvmet_check_auth_status(req)))
+		return NVME_SC_AUTH_REQUIRED | NVME_SC_DNR;
 	if (nvmet_is_disc_subsys(nvmet_req_subsys(req)))
 		return nvmet_parse_discovery_cmd(req);
 
diff --git a/drivers/nvme/target/auth.c b/drivers/nvme/target/auth.c
new file mode 100644
index 000000000000..cf690df34775
--- /dev/null
+++ b/drivers/nvme/target/auth.c
@@ -0,0 +1,525 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NVMe over Fabrics DH-HMAC-CHAP authentication.
+ * Copyright (c) 2020 Hannes Reinecke, SUSE Software Solutions.
+ * All rights reserved.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/err.h>
+#include <crypto/hash.h>
+#include <linux/crc32.h>
+#include <linux/base64.h>
+#include <linux/ctype.h>
+#include <linux/random.h>
+#include <linux/nvme-auth.h>
+#include <asm/unaligned.h>
+
+#include "nvmet.h"
+
+int nvmet_auth_set_key(struct nvmet_host *host, const char *secret,
+		       bool set_ctrl)
+{
+	unsigned char key_hash;
+	char *dhchap_secret;
+
+	if (sscanf(secret, "DHHC-1:%hhd:%*s", &key_hash) != 1)
+		return -EINVAL;
+	if (key_hash > 3) {
+		pr_warn("Invalid DH-HMAC-CHAP hash id %d\n",
+			 key_hash);
+		return -EINVAL;
+	}
+	if (key_hash > 0) {
+		/* Validate selected hash algorithm */
+		const char *hmac = nvme_auth_hmac_name(key_hash);
+
+		if (!crypto_has_shash(hmac, 0, 0)) {
+			pr_err("DH-HMAC-CHAP hash %s unsupported\n", hmac);
+			return -ENOTSUPP;
+		}
+	}
+	dhchap_secret = kstrdup(secret, GFP_KERNEL);
+	if (!dhchap_secret)
+		return -ENOMEM;
+	if (set_ctrl) {
+		host->dhchap_ctrl_secret = strim(dhchap_secret);
+		host->dhchap_ctrl_key_hash = key_hash;
+	} else {
+		host->dhchap_secret = strim(dhchap_secret);
+		host->dhchap_key_hash = key_hash;
+	}
+	return 0;
+}
+
+int nvmet_setup_dhgroup(struct nvmet_ctrl *ctrl, u8 dhgroup_id)
+{
+	const char *dhgroup_kpp;
+	int ret = 0;
+
+	pr_debug("%s: ctrl %d selecting dhgroup %d\n",
+		 __func__, ctrl->cntlid, dhgroup_id);
+
+	if (ctrl->dh_tfm) {
+		if (ctrl->dh_gid == dhgroup_id) {
+			pr_debug("%s: ctrl %d reuse existing DH group %d\n",
+				 __func__, ctrl->cntlid, dhgroup_id);
+			return 0;
+		}
+		crypto_free_kpp(ctrl->dh_tfm);
+		ctrl->dh_tfm = NULL;
+		ctrl->dh_gid = 0;
+	}
+
+	if (dhgroup_id == NVME_AUTH_DHGROUP_NULL)
+		return 0;
+
+	dhgroup_kpp = nvme_auth_dhgroup_kpp(dhgroup_id);
+	if (!dhgroup_kpp) {
+		pr_debug("%s: ctrl %d invalid DH group %d\n",
+			 __func__, ctrl->cntlid, dhgroup_id);
+		return -EINVAL;
+	}
+	ctrl->dh_tfm = crypto_alloc_kpp(dhgroup_kpp, 0, 0);
+	if (IS_ERR(ctrl->dh_tfm)) {
+		pr_debug("%s: ctrl %d failed to setup DH group %d, err %ld\n",
+			 __func__, ctrl->cntlid, dhgroup_id,
+			 PTR_ERR(ctrl->dh_tfm));
+		ret = PTR_ERR(ctrl->dh_tfm);
+		ctrl->dh_tfm = NULL;
+		ctrl->dh_gid = 0;
+	} else {
+		ctrl->dh_gid = dhgroup_id;
+		pr_debug("%s: ctrl %d setup DH group %d\n",
+			 __func__, ctrl->cntlid, ctrl->dh_gid);
+		ret = nvme_auth_gen_privkey(ctrl->dh_tfm, ctrl->dh_gid);
+		if (ret < 0) {
+			pr_debug("%s: ctrl %d failed to generate private key, err %d\n",
+				 __func__, ctrl->cntlid, ret);
+			kfree_sensitive(ctrl->dh_key);
+			return ret;
+		}
+		ctrl->dh_keysize = crypto_kpp_maxsize(ctrl->dh_tfm);
+		kfree_sensitive(ctrl->dh_key);
+		ctrl->dh_key = kzalloc(ctrl->dh_keysize, GFP_KERNEL);
+		if (!ctrl->dh_key) {
+			pr_warn("ctrl %d failed to allocate public key\n",
+				ctrl->cntlid);
+			return -ENOMEM;
+		}
+		ret = nvme_auth_gen_pubkey(ctrl->dh_tfm, ctrl->dh_key,
+					   ctrl->dh_keysize);
+		if (ret < 0) {
+			pr_warn("ctrl %d failed to generate public key\n",
+				ctrl->cntlid);
+			kfree(ctrl->dh_key);
+			ctrl->dh_key = NULL;
+		}
+	}
+
+	return ret;
+}
+
+int nvmet_setup_auth(struct nvmet_ctrl *ctrl)
+{
+	int ret = 0;
+	struct nvmet_host_link *p;
+	struct nvmet_host *host = NULL;
+	const char *hash_name;
+
+	down_read(&nvmet_config_sem);
+	if (nvmet_is_disc_subsys(ctrl->subsys))
+		goto out_unlock;
+
+	if (ctrl->subsys->allow_any_host)
+		goto out_unlock;
+
+	list_for_each_entry(p, &ctrl->subsys->hosts, entry) {
+		pr_debug("check %s\n", nvmet_host_name(p->host));
+		if (strcmp(nvmet_host_name(p->host), ctrl->hostnqn))
+			continue;
+		host = p->host;
+		break;
+	}
+	if (!host) {
+		pr_debug("host %s not found\n", ctrl->hostnqn);
+		ret = -EPERM;
+		goto out_unlock;
+	}
+
+	ret = nvmet_setup_dhgroup(ctrl, host->dhchap_dhgroup_id);
+	if (ret < 0)
+		pr_warn("Failed to setup DH group");
+
+	if (!host->dhchap_secret) {
+		pr_debug("No authentication provided\n");
+		goto out_unlock;
+	}
+
+	if (host->dhchap_hash_id == ctrl->shash_id) {
+		pr_debug("Re-use existing hash ID %d\n",
+			 ctrl->shash_id);
+	} else {
+		hash_name = nvme_auth_hmac_name(host->dhchap_hash_id);
+		if (!hash_name) {
+			pr_warn("Hash ID %d invalid\n", host->dhchap_hash_id);
+			ret = -EINVAL;
+			goto out_unlock;
+		}
+		ctrl->shash_id = host->dhchap_hash_id;
+	}
+
+	/* Skip the 'DHHC-1:XX:' prefix */
+	nvme_auth_free_key(ctrl->host_key);
+	ctrl->host_key = nvme_auth_extract_key(host->dhchap_secret + 10,
+					       host->dhchap_key_hash);
+	if (IS_ERR(ctrl->host_key)) {
+		ret = PTR_ERR(ctrl->host_key);
+		ctrl->host_key = NULL;
+		goto out_free_hash;
+	}
+	pr_debug("%s: using hash %s key %*ph\n", __func__,
+		 ctrl->host_key->hash > 0 ?
+		 nvme_auth_hmac_name(ctrl->host_key->hash) : "none",
+		 (int)ctrl->host_key->len, ctrl->host_key->key);
+
+	nvme_auth_free_key(ctrl->ctrl_key);
+	if (!host->dhchap_ctrl_secret) {
+		ctrl->ctrl_key = NULL;
+		goto out_unlock;
+	}
+
+	ctrl->ctrl_key = nvme_auth_extract_key(host->dhchap_ctrl_secret + 10,
+					       host->dhchap_ctrl_key_hash);
+	if (IS_ERR(ctrl->ctrl_key)) {
+		ret = PTR_ERR(ctrl->ctrl_key);
+		ctrl->ctrl_key = NULL;
+	}
+	pr_debug("%s: using ctrl hash %s key %*ph\n", __func__,
+		 ctrl->ctrl_key->hash > 0 ?
+		 nvme_auth_hmac_name(ctrl->ctrl_key->hash) : "none",
+		 (int)ctrl->ctrl_key->len, ctrl->ctrl_key->key);
+
+out_free_hash:
+	if (ret) {
+		if (ctrl->host_key) {
+			nvme_auth_free_key(ctrl->host_key);
+			ctrl->host_key = NULL;
+		}
+		ctrl->shash_id = 0;
+	}
+out_unlock:
+	up_read(&nvmet_config_sem);
+
+	return ret;
+}
+
+void nvmet_auth_sq_free(struct nvmet_sq *sq)
+{
+	cancel_delayed_work(&sq->auth_expired_work);
+	kfree(sq->dhchap_c1);
+	sq->dhchap_c1 = NULL;
+	kfree(sq->dhchap_c2);
+	sq->dhchap_c2 = NULL;
+	kfree(sq->dhchap_skey);
+	sq->dhchap_skey = NULL;
+}
+
+void nvmet_destroy_auth(struct nvmet_ctrl *ctrl)
+{
+	ctrl->shash_id = 0;
+
+	if (ctrl->dh_tfm) {
+		crypto_free_kpp(ctrl->dh_tfm);
+		ctrl->dh_tfm = NULL;
+		ctrl->dh_gid = 0;
+	}
+	kfree_sensitive(ctrl->dh_key);
+	ctrl->dh_key = NULL;
+
+	if (ctrl->host_key) {
+		nvme_auth_free_key(ctrl->host_key);
+		ctrl->host_key = NULL;
+	}
+	if (ctrl->ctrl_key) {
+		nvme_auth_free_key(ctrl->ctrl_key);
+		ctrl->ctrl_key = NULL;
+	}
+}
+
+bool nvmet_check_auth_status(struct nvmet_req *req)
+{
+	if (req->sq->ctrl->host_key &&
+	    !req->sq->authenticated)
+		return false;
+	return true;
+}
+
+int nvmet_auth_host_hash(struct nvmet_req *req, u8 *response,
+			 unsigned int shash_len)
+{
+	struct crypto_shash *shash_tfm;
+	struct shash_desc *shash;
+	struct nvmet_ctrl *ctrl = req->sq->ctrl;
+	const char *hash_name;
+	u8 *challenge = req->sq->dhchap_c1, *host_response;
+	u8 buf[4];
+	int ret;
+
+	hash_name = nvme_auth_hmac_name(ctrl->shash_id);
+	if (!hash_name) {
+		pr_warn("Hash ID %d invalid\n", ctrl->shash_id);
+		return -EINVAL;
+	}
+
+	shash_tfm = crypto_alloc_shash(hash_name, 0, 0);
+	if (IS_ERR(shash_tfm)) {
+		pr_err("failed to allocate shash %s\n", hash_name);
+		return PTR_ERR(shash_tfm);
+	}
+
+	if (shash_len != crypto_shash_digestsize(shash_tfm)) {
+		pr_debug("%s: hash len mismatch (len %d digest %d)\n",
+			 __func__, shash_len,
+			 crypto_shash_digestsize(shash_tfm));
+		ret = -EINVAL;
+		goto out_free_tfm;
+	}
+
+	host_response = nvme_auth_transform_key(ctrl->host_key, ctrl->hostnqn);
+	if (IS_ERR(host_response)) {
+		ret = PTR_ERR(host_response);
+		goto out_free_tfm;
+	}
+
+	ret = crypto_shash_setkey(shash_tfm, host_response,
+				  ctrl->host_key->len);
+	if (ret)
+		goto out_free_response;
+
+	if (ctrl->dh_gid != NVME_AUTH_DHGROUP_NULL) {
+		challenge = kmalloc(shash_len, GFP_KERNEL);
+		if (!challenge) {
+			ret = -ENOMEM;
+			goto out_free_response;
+		}
+		ret = nvme_auth_augmented_challenge(ctrl->shash_id,
+						    req->sq->dhchap_skey,
+						    req->sq->dhchap_skey_len,
+						    req->sq->dhchap_c1,
+						    challenge, shash_len);
+		if (ret)
+			goto out_free_response;
+	}
+
+	pr_debug("ctrl %d qid %d host response seq %u transaction %d\n",
+		 ctrl->cntlid, req->sq->qid, req->sq->dhchap_s1,
+		 req->sq->dhchap_tid);
+
+	shash = kzalloc(sizeof(*shash) + crypto_shash_descsize(shash_tfm),
+			GFP_KERNEL);
+	if (!shash) {
+		ret = -ENOMEM;
+		goto out_free_response;
+	}
+	shash->tfm = shash_tfm;
+	ret = crypto_shash_init(shash);
+	if (ret)
+		goto out;
+	ret = crypto_shash_update(shash, challenge, shash_len);
+	if (ret)
+		goto out;
+	put_unaligned_le32(req->sq->dhchap_s1, buf);
+	ret = crypto_shash_update(shash, buf, 4);
+	if (ret)
+		goto out;
+	put_unaligned_le16(req->sq->dhchap_tid, buf);
+	ret = crypto_shash_update(shash, buf, 2);
+	if (ret)
+		goto out;
+	memset(buf, 0, 4);
+	ret = crypto_shash_update(shash, buf, 1);
+	if (ret)
+		goto out;
+	ret = crypto_shash_update(shash, "HostHost", 8);
+	if (ret)
+		goto out;
+	ret = crypto_shash_update(shash, ctrl->hostnqn, strlen(ctrl->hostnqn));
+	if (ret)
+		goto out;
+	ret = crypto_shash_update(shash, buf, 1);
+	if (ret)
+		goto out;
+	ret = crypto_shash_update(shash, ctrl->subsysnqn,
+				  strlen(ctrl->subsysnqn));
+	if (ret)
+		goto out;
+	ret = crypto_shash_final(shash, response);
+out:
+	if (challenge != req->sq->dhchap_c1)
+		kfree(challenge);
+	kfree(shash);
+out_free_response:
+	kfree_sensitive(host_response);
+out_free_tfm:
+	crypto_free_shash(shash_tfm);
+	return 0;
+}
+
+int nvmet_auth_ctrl_hash(struct nvmet_req *req, u8 *response,
+			 unsigned int shash_len)
+{
+	struct crypto_shash *shash_tfm;
+	struct shash_desc *shash;
+	struct nvmet_ctrl *ctrl = req->sq->ctrl;
+	const char *hash_name;
+	u8 *challenge = req->sq->dhchap_c2, *ctrl_response;
+	u8 buf[4];
+	int ret;
+
+	hash_name = nvme_auth_hmac_name(ctrl->shash_id);
+	if (!hash_name) {
+		pr_warn("Hash ID %d invalid\n", ctrl->shash_id);
+		return -EINVAL;
+	}
+
+	shash_tfm = crypto_alloc_shash(hash_name, 0, 0);
+	if (IS_ERR(shash_tfm)) {
+		pr_err("failed to allocate shash %s\n", hash_name);
+		return PTR_ERR(shash_tfm);
+	}
+
+	if (shash_len != crypto_shash_digestsize(shash_tfm)) {
+		pr_debug("%s: hash len mismatch (len %d digest %d)\n",
+			 __func__, shash_len,
+			 crypto_shash_digestsize(shash_tfm));
+		ret = -EINVAL;
+		goto out_free_tfm;
+	}
+
+	ctrl_response = nvme_auth_transform_key(ctrl->ctrl_key,
+						ctrl->subsysnqn);
+	if (IS_ERR(ctrl_response)) {
+		ret = PTR_ERR(ctrl_response);
+		goto out_free_tfm;
+	}
+
+	ret = crypto_shash_setkey(shash_tfm, ctrl_response,
+				  ctrl->ctrl_key->len);
+	if (ret)
+		goto out_free_response;
+
+	if (ctrl->dh_gid != NVME_AUTH_DHGROUP_NULL) {
+		challenge = kmalloc(shash_len, GFP_KERNEL);
+		if (!challenge) {
+			ret = -ENOMEM;
+			goto out_free_response;
+		}
+		ret = nvme_auth_augmented_challenge(ctrl->shash_id,
+						    req->sq->dhchap_skey,
+						    req->sq->dhchap_skey_len,
+						    req->sq->dhchap_c2,
+						    challenge, shash_len);
+		if (ret)
+			goto out_free_response;
+	}
+
+	shash = kzalloc(sizeof(*shash) + crypto_shash_descsize(shash_tfm),
+			GFP_KERNEL);
+	if (!shash) {
+		ret = -ENOMEM;
+		goto out_free_response;
+	}
+	shash->tfm = shash_tfm;
+
+	ret = crypto_shash_init(shash);
+	if (ret)
+		goto out;
+	ret = crypto_shash_update(shash, challenge, shash_len);
+	if (ret)
+		goto out;
+	put_unaligned_le32(req->sq->dhchap_s2, buf);
+	ret = crypto_shash_update(shash, buf, 4);
+	if (ret)
+		goto out;
+	put_unaligned_le16(req->sq->dhchap_tid, buf);
+	ret = crypto_shash_update(shash, buf, 2);
+	if (ret)
+		goto out;
+	memset(buf, 0, 4);
+	ret = crypto_shash_update(shash, buf, 1);
+	if (ret)
+		goto out;
+	ret = crypto_shash_update(shash, "Controller", 10);
+	if (ret)
+		goto out;
+	ret = crypto_shash_update(shash, ctrl->subsysnqn,
+			    strlen(ctrl->subsysnqn));
+	if (ret)
+		goto out;
+	ret = crypto_shash_update(shash, buf, 1);
+	if (ret)
+		goto out;
+	ret = crypto_shash_update(shash, ctrl->hostnqn, strlen(ctrl->hostnqn));
+	if (ret)
+		goto out;
+	ret = crypto_shash_final(shash, response);
+out:
+	if (challenge != req->sq->dhchap_c2)
+		kfree(challenge);
+	kfree(shash);
+out_free_response:
+	kfree_sensitive(ctrl_response);
+out_free_tfm:
+	crypto_free_shash(shash_tfm);
+	return 0;
+}
+
+int nvmet_auth_ctrl_exponential(struct nvmet_req *req,
+				u8 *buf, int buf_size)
+{
+	struct nvmet_ctrl *ctrl = req->sq->ctrl;
+	int ret = 0;
+
+	if (!ctrl->dh_key) {
+		pr_warn("ctrl %d no DH public key!\n", ctrl->cntlid);
+		return -ENOKEY;
+	}
+	if (buf_size != ctrl->dh_keysize) {
+		pr_warn("ctrl %d DH public key size mismatch, need %zu is %d\n",
+			ctrl->cntlid, ctrl->dh_keysize, buf_size);
+		ret = -EINVAL;
+	} else {
+		memcpy(buf, ctrl->dh_key, buf_size);
+		pr_debug("%s: ctrl %d public key %*ph\n", __func__,
+			 ctrl->cntlid, (int)buf_size, buf);
+	}
+
+	return ret;
+}
+
+int nvmet_auth_ctrl_sesskey(struct nvmet_req *req,
+			    u8 *pkey, int pkey_size)
+{
+	struct nvmet_ctrl *ctrl = req->sq->ctrl;
+	int ret;
+
+	req->sq->dhchap_skey_len = ctrl->dh_keysize;
+	req->sq->dhchap_skey = kzalloc(req->sq->dhchap_skey_len, GFP_KERNEL);
+	if (!req->sq->dhchap_skey)
+		return -ENOMEM;
+	ret = nvme_auth_gen_shared_secret(ctrl->dh_tfm,
+					  pkey, pkey_size,
+					  req->sq->dhchap_skey,
+					  req->sq->dhchap_skey_len);
+	if (ret)
+		pr_debug("failed to compute shared secret, err %d\n", ret);
+	else
+		pr_debug("%s: shared secret %*ph\n", __func__,
+			 (int)req->sq->dhchap_skey_len,
+			 req->sq->dhchap_skey);
+
+	return ret;
+}
diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c
index ff77c3d2354f..2bcd60758919 100644
--- a/drivers/nvme/target/configfs.c
+++ b/drivers/nvme/target/configfs.c
@@ -11,6 +11,11 @@
 #include <linux/ctype.h>
 #include <linux/pci.h>
 #include <linux/pci-p2pdma.h>
+#ifdef CONFIG_NVME_TARGET_AUTH
+#include <linux/nvme-auth.h>
+#endif
+#include <crypto/hash.h>
+#include <crypto/kpp.h>
 
 #include "nvmet.h"
 
@@ -1680,10 +1685,133 @@ static const struct config_item_type nvmet_ports_type = {
 static struct config_group nvmet_subsystems_group;
 static struct config_group nvmet_ports_group;
 
+#ifdef CONFIG_NVME_TARGET_AUTH
+static ssize_t nvmet_host_dhchap_key_show(struct config_item *item,
+		char *page)
+{
+	u8 *dhchap_secret = to_host(item)->dhchap_secret;
+
+	if (!dhchap_secret)
+		return sprintf(page, "\n");
+	return sprintf(page, "%s\n", dhchap_secret);
+}
+
+static ssize_t nvmet_host_dhchap_key_store(struct config_item *item,
+		const char *page, size_t count)
+{
+	struct nvmet_host *host = to_host(item);
+	int ret;
+
+	ret = nvmet_auth_set_key(host, page, false);
+	/*
+	 * Re-authentication is a soft state, so keep the
+	 * current authentication valid until the host
+	 * requests re-authentication.
+	 */
+	return ret < 0 ? ret : count;
+}
+
+CONFIGFS_ATTR(nvmet_host_, dhchap_key);
+
+static ssize_t nvmet_host_dhchap_ctrl_key_show(struct config_item *item,
+		char *page)
+{
+	u8 *dhchap_secret = to_host(item)->dhchap_ctrl_secret;
+
+	if (!dhchap_secret)
+		return sprintf(page, "\n");
+	return sprintf(page, "%s\n", dhchap_secret);
+}
+
+static ssize_t nvmet_host_dhchap_ctrl_key_store(struct config_item *item,
+		const char *page, size_t count)
+{
+	struct nvmet_host *host = to_host(item);
+	int ret;
+
+	ret = nvmet_auth_set_key(host, page, true);
+	/*
+	 * Re-authentication is a soft state, so keep the
+	 * current authentication valid until the host
+	 * requests re-authentication.
+	 */
+	return ret < 0 ? ret : count;
+}
+
+CONFIGFS_ATTR(nvmet_host_, dhchap_ctrl_key);
+
+static ssize_t nvmet_host_dhchap_hash_show(struct config_item *item,
+		char *page)
+{
+	struct nvmet_host *host = to_host(item);
+	const char *hash_name = nvme_auth_hmac_name(host->dhchap_hash_id);
+
+	return sprintf(page, "%s\n", hash_name ? hash_name : "none");
+}
+
+static ssize_t nvmet_host_dhchap_hash_store(struct config_item *item,
+		const char *page, size_t count)
+{
+	struct nvmet_host *host = to_host(item);
+	u8 hmac_id;
+
+	hmac_id = nvme_auth_hmac_id(page);
+	if (hmac_id == NVME_AUTH_HASH_INVALID)
+		return -EINVAL;
+	if (!crypto_has_shash(nvme_auth_hmac_name(hmac_id), 0, 0))
+		return -ENOTSUPP;
+	host->dhchap_hash_id = hmac_id;
+	return count;
+}
+
+CONFIGFS_ATTR(nvmet_host_, dhchap_hash);
+
+static ssize_t nvmet_host_dhchap_dhgroup_show(struct config_item *item,
+		char *page)
+{
+	struct nvmet_host *host = to_host(item);
+	const char *dhgroup = nvme_auth_dhgroup_name(host->dhchap_dhgroup_id);
+
+	return sprintf(page, "%s\n", dhgroup ? dhgroup : "none");
+}
+
+static ssize_t nvmet_host_dhchap_dhgroup_store(struct config_item *item,
+		const char *page, size_t count)
+{
+	struct nvmet_host *host = to_host(item);
+	int dhgroup_id;
+
+	dhgroup_id = nvme_auth_dhgroup_id(page);
+	if (dhgroup_id == NVME_AUTH_DHGROUP_INVALID)
+		return -EINVAL;
+	if (dhgroup_id != NVME_AUTH_DHGROUP_NULL) {
+		const char *kpp = nvme_auth_dhgroup_kpp(dhgroup_id);
+
+		if (!crypto_has_kpp(kpp, 0, 0))
+			return -EINVAL;
+	}
+	host->dhchap_dhgroup_id = dhgroup_id;
+	return count;
+}
+
+CONFIGFS_ATTR(nvmet_host_, dhchap_dhgroup);
+
+static struct configfs_attribute *nvmet_host_attrs[] = {
+	&nvmet_host_attr_dhchap_key,
+	&nvmet_host_attr_dhchap_ctrl_key,
+	&nvmet_host_attr_dhchap_hash,
+	&nvmet_host_attr_dhchap_dhgroup,
+	NULL,
+};
+#endif /* CONFIG_NVME_TARGET_AUTH */
+
 static void nvmet_host_release(struct config_item *item)
 {
 	struct nvmet_host *host = to_host(item);
 
+#ifdef CONFIG_NVME_TARGET_AUTH
+	kfree(host->dhchap_secret);
+#endif
 	kfree(host);
 }
 
@@ -1693,6 +1821,9 @@ static struct configfs_item_operations nvmet_host_item_ops = {
 
 static const struct config_item_type nvmet_host_type = {
 	.ct_item_ops		= &nvmet_host_item_ops,
+#ifdef CONFIG_NVME_TARGET_AUTH
+	.ct_attrs		= nvmet_host_attrs,
+#endif
 	.ct_owner		= THIS_MODULE,
 };
 
@@ -1705,6 +1836,11 @@ static struct config_group *nvmet_hosts_make_group(struct config_group *group,
 	if (!host)
 		return ERR_PTR(-ENOMEM);
 
+#ifdef CONFIG_NVME_TARGET_AUTH
+	/* Default to SHA256 */
+	host->dhchap_hash_id = NVME_AUTH_HASH_SHA256;
+#endif
+
 	config_group_init_type_name(&host->group, name, &nvmet_host_type);
 
 	return &host->group;
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index c27660a660d9..a1345790005f 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -795,6 +795,7 @@ void nvmet_sq_destroy(struct nvmet_sq *sq)
 	wait_for_completion(&sq->confirm_done);
 	wait_for_completion(&sq->free_done);
 	percpu_ref_exit(&sq->ref);
+	nvmet_auth_sq_free(sq);
 
 	if (ctrl) {
 		/*
@@ -865,8 +866,15 @@ static inline u16 nvmet_io_cmd_check_access(struct nvmet_req *req)
 
 static u16 nvmet_parse_io_cmd(struct nvmet_req *req)
 {
+	struct nvme_command *cmd = req->cmd;
 	u16 ret;
 
+	if (nvme_is_fabrics(cmd))
+		return nvmet_parse_fabrics_io_cmd(req);
+
+	if (unlikely(!nvmet_check_auth_status(req)))
+		return NVME_SC_AUTH_REQUIRED | NVME_SC_DNR;
+
 	ret = nvmet_check_ctrl_status(req);
 	if (unlikely(ret))
 		return ret;
@@ -1271,6 +1279,11 @@ u16 nvmet_check_ctrl_status(struct nvmet_req *req)
 		       req->cmd->common.opcode, req->sq->qid);
 		return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR;
 	}
+
+	if (unlikely(!nvmet_check_auth_status(req))) {
+		pr_warn("qid %d not authenticated\n", req->sq->qid);
+		return NVME_SC_AUTH_REQUIRED | NVME_SC_DNR;
+	}
 	return 0;
 }
 
@@ -1467,6 +1480,8 @@ static void nvmet_ctrl_free(struct kref *ref)
 	flush_work(&ctrl->async_event_work);
 	cancel_work_sync(&ctrl->fatal_err_work);
 
+	nvmet_destroy_auth(ctrl);
+
 	ida_free(&cntlid_ida, ctrl->cntlid);
 
 	nvmet_async_events_free(ctrl);
diff --git a/drivers/nvme/target/fabrics-cmd-auth.c b/drivers/nvme/target/fabrics-cmd-auth.c
new file mode 100644
index 000000000000..c851814d6cb0
--- /dev/null
+++ b/drivers/nvme/target/fabrics-cmd-auth.c
@@ -0,0 +1,544 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NVMe over Fabrics DH-HMAC-CHAP authentication command handling.
+ * Copyright (c) 2020 Hannes Reinecke, SUSE Software Solutions.
+ * All rights reserved.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/blkdev.h>
+#include <linux/random.h>
+#include <linux/nvme-auth.h>
+#include <crypto/hash.h>
+#include <crypto/kpp.h>
+#include "nvmet.h"
+
+static void nvmet_auth_expired_work(struct work_struct *work)
+{
+	struct nvmet_sq *sq = container_of(to_delayed_work(work),
+			struct nvmet_sq, auth_expired_work);
+
+	pr_debug("%s: ctrl %d qid %d transaction %u expired, resetting\n",
+		 __func__, sq->ctrl->cntlid, sq->qid, sq->dhchap_tid);
+	sq->dhchap_step = NVME_AUTH_DHCHAP_MESSAGE_NEGOTIATE;
+	sq->dhchap_tid = -1;
+}
+
+void nvmet_init_auth(struct nvmet_ctrl *ctrl, struct nvmet_req *req)
+{
+	u32 result = le32_to_cpu(req->cqe->result.u32);
+
+	/* Initialize in-band authentication */
+	INIT_DELAYED_WORK(&req->sq->auth_expired_work,
+			  nvmet_auth_expired_work);
+	req->sq->authenticated = false;
+	req->sq->dhchap_step = NVME_AUTH_DHCHAP_MESSAGE_NEGOTIATE;
+	result |= (u32)NVME_CONNECT_AUTHREQ_ATR << 16;
+	req->cqe->result.u32 = cpu_to_le32(result);
+}
+
+static u16 nvmet_auth_negotiate(struct nvmet_req *req, void *d)
+{
+	struct nvmet_ctrl *ctrl = req->sq->ctrl;
+	struct nvmf_auth_dhchap_negotiate_data *data = d;
+	int i, hash_id = 0, fallback_hash_id = 0, dhgid, fallback_dhgid;
+
+	pr_debug("%s: ctrl %d qid %d: data sc_d %d napd %d authid %d halen %d dhlen %d\n",
+		 __func__, ctrl->cntlid, req->sq->qid,
+		 data->sc_c, data->napd, data->auth_protocol[0].dhchap.authid,
+		 data->auth_protocol[0].dhchap.halen,
+		 data->auth_protocol[0].dhchap.dhlen);
+	req->sq->dhchap_tid = le16_to_cpu(data->t_id);
+	if (data->sc_c)
+		return NVME_AUTH_DHCHAP_FAILURE_CONCAT_MISMATCH;
+
+	if (data->napd != 1)
+		return NVME_AUTH_DHCHAP_FAILURE_HASH_UNUSABLE;
+
+	if (data->auth_protocol[0].dhchap.authid !=
+	    NVME_AUTH_DHCHAP_AUTH_ID)
+		return NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
+
+	for (i = 0; i < data->auth_protocol[0].dhchap.halen; i++) {
+		u8 host_hmac_id = data->auth_protocol[0].dhchap.idlist[i];
+
+		if (!fallback_hash_id &&
+		    crypto_has_shash(nvme_auth_hmac_name(host_hmac_id), 0, 0))
+			fallback_hash_id = host_hmac_id;
+		if (ctrl->shash_id != host_hmac_id)
+			continue;
+		hash_id = ctrl->shash_id;
+		break;
+	}
+	if (hash_id == 0) {
+		if (fallback_hash_id == 0) {
+			pr_debug("%s: ctrl %d qid %d: no usable hash found\n",
+				 __func__, ctrl->cntlid, req->sq->qid);
+			return NVME_AUTH_DHCHAP_FAILURE_HASH_UNUSABLE;
+		}
+		pr_debug("%s: ctrl %d qid %d: no usable hash found, falling back to %s\n",
+			 __func__, ctrl->cntlid, req->sq->qid,
+			 nvme_auth_hmac_name(fallback_hash_id));
+		ctrl->shash_id = fallback_hash_id;
+	}
+
+	dhgid = -1;
+	fallback_dhgid = -1;
+	for (i = 0; i < data->auth_protocol[0].dhchap.dhlen; i++) {
+		int tmp_dhgid = data->auth_protocol[0].dhchap.idlist[i + 30];
+
+		if (tmp_dhgid != ctrl->dh_gid) {
+			dhgid = tmp_dhgid;
+			break;
+		}
+		if (fallback_dhgid < 0) {
+			const char *kpp = nvme_auth_dhgroup_kpp(tmp_dhgid);
+
+			if (crypto_has_kpp(kpp, 0, 0))
+				fallback_dhgid = tmp_dhgid;
+		}
+	}
+	if (dhgid < 0) {
+		if (fallback_dhgid < 0) {
+			pr_debug("%s: ctrl %d qid %d: no usable DH group found\n",
+				 __func__, ctrl->cntlid, req->sq->qid);
+			return NVME_AUTH_DHCHAP_FAILURE_DHGROUP_UNUSABLE;
+		}
+		pr_debug("%s: ctrl %d qid %d: configured DH group %s not found\n",
+			 __func__, ctrl->cntlid, req->sq->qid,
+			 nvme_auth_dhgroup_name(fallback_dhgid));
+		ctrl->dh_gid = fallback_dhgid;
+	}
+	pr_debug("%s: ctrl %d qid %d: selected DH group %s (%d)\n",
+		 __func__, ctrl->cntlid, req->sq->qid,
+		 nvme_auth_dhgroup_name(ctrl->dh_gid), ctrl->dh_gid);
+	return 0;
+}
+
+static u16 nvmet_auth_reply(struct nvmet_req *req, void *d)
+{
+	struct nvmet_ctrl *ctrl = req->sq->ctrl;
+	struct nvmf_auth_dhchap_reply_data *data = d;
+	u16 dhvlen = le16_to_cpu(data->dhvlen);
+	u8 *response;
+
+	pr_debug("%s: ctrl %d qid %d: data hl %d cvalid %d dhvlen %u\n",
+		 __func__, ctrl->cntlid, req->sq->qid,
+		 data->hl, data->cvalid, dhvlen);
+
+	if (dhvlen) {
+		if (!ctrl->dh_tfm)
+			return NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
+		if (nvmet_auth_ctrl_sesskey(req, data->rval + 2 * data->hl,
+					    dhvlen) < 0)
+			return NVME_AUTH_DHCHAP_FAILURE_DHGROUP_UNUSABLE;
+	}
+
+	response = kmalloc(data->hl, GFP_KERNEL);
+	if (!response)
+		return NVME_AUTH_DHCHAP_FAILURE_FAILED;
+
+	if (!ctrl->host_key) {
+		pr_warn("ctrl %d qid %d no host key\n",
+			ctrl->cntlid, req->sq->qid);
+		kfree(response);
+		return NVME_AUTH_DHCHAP_FAILURE_FAILED;
+	}
+	if (nvmet_auth_host_hash(req, response, data->hl) < 0) {
+		pr_debug("ctrl %d qid %d host hash failed\n",
+			 ctrl->cntlid, req->sq->qid);
+		kfree(response);
+		return NVME_AUTH_DHCHAP_FAILURE_FAILED;
+	}
+
+	if (memcmp(data->rval, response, data->hl)) {
+		pr_info("ctrl %d qid %d host response mismatch\n",
+			ctrl->cntlid, req->sq->qid);
+		kfree(response);
+		return NVME_AUTH_DHCHAP_FAILURE_FAILED;
+	}
+	kfree(response);
+	pr_debug("%s: ctrl %d qid %d host authenticated\n",
+		 __func__, ctrl->cntlid, req->sq->qid);
+	if (data->cvalid) {
+		req->sq->dhchap_c2 = kmalloc(data->hl, GFP_KERNEL);
+		if (!req->sq->dhchap_c2)
+			return NVME_AUTH_DHCHAP_FAILURE_FAILED;
+		memcpy(req->sq->dhchap_c2, data->rval + data->hl, data->hl);
+
+		pr_debug("%s: ctrl %d qid %d challenge %*ph\n",
+			 __func__, ctrl->cntlid, req->sq->qid, data->hl,
+			 req->sq->dhchap_c2);
+		req->sq->dhchap_s2 = le32_to_cpu(data->seqnum);
+	} else {
+		req->sq->authenticated = true;
+		req->sq->dhchap_c2 = NULL;
+	}
+
+	return 0;
+}
+
+static u16 nvmet_auth_failure2(struct nvmet_req *req, void *d)
+{
+	struct nvmf_auth_dhchap_failure_data *data = d;
+
+	return data->rescode_exp;
+}
+
+void nvmet_execute_auth_send(struct nvmet_req *req)
+{
+	struct nvmet_ctrl *ctrl = req->sq->ctrl;
+	struct nvmf_auth_dhchap_success2_data *data;
+	void *d;
+	u32 tl;
+	u16 status = 0;
+
+	if (req->cmd->auth_send.secp != NVME_AUTH_DHCHAP_PROTOCOL_IDENTIFIER) {
+		status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
+		req->error_loc =
+			offsetof(struct nvmf_auth_send_command, secp);
+		goto done;
+	}
+	if (req->cmd->auth_send.spsp0 != 0x01) {
+		status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
+		req->error_loc =
+			offsetof(struct nvmf_auth_send_command, spsp0);
+		goto done;
+	}
+	if (req->cmd->auth_send.spsp1 != 0x01) {
+		status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
+		req->error_loc =
+			offsetof(struct nvmf_auth_send_command, spsp1);
+		goto done;
+	}
+	tl = le32_to_cpu(req->cmd->auth_send.tl);
+	if (!tl) {
+		status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
+		req->error_loc =
+			offsetof(struct nvmf_auth_send_command, tl);
+		goto done;
+	}
+	if (!nvmet_check_transfer_len(req, tl)) {
+		pr_debug("%s: transfer length mismatch (%u)\n", __func__, tl);
+		return;
+	}
+
+	d = kmalloc(tl, GFP_KERNEL);
+	if (!d) {
+		status = NVME_SC_INTERNAL;
+		goto done;
+	}
+
+	status = nvmet_copy_from_sgl(req, 0, d, tl);
+	if (status) {
+		kfree(d);
+		goto done;
+	}
+
+	data = d;
+	pr_debug("%s: ctrl %d qid %d type %d id %d step %x\n", __func__,
+		 ctrl->cntlid, req->sq->qid, data->auth_type, data->auth_id,
+		 req->sq->dhchap_step);
+	if (data->auth_type != NVME_AUTH_COMMON_MESSAGES &&
+	    data->auth_type != NVME_AUTH_DHCHAP_MESSAGES)
+		goto done_failure1;
+	if (data->auth_type == NVME_AUTH_COMMON_MESSAGES) {
+		if (data->auth_id == NVME_AUTH_DHCHAP_MESSAGE_NEGOTIATE) {
+			/* Restart negotiation */
+			pr_debug("%s: ctrl %d qid %d reset negotiation\n", __func__,
+				 ctrl->cntlid, req->sq->qid);
+			if (!req->sq->qid) {
+				if (nvmet_setup_auth(ctrl) < 0) {
+					status = NVME_SC_INTERNAL;
+					pr_err("ctrl %d qid 0 failed to setup"
+					       "re-authentication",
+					       ctrl->cntlid);
+					goto done_failure1;
+				}
+			}
+			req->sq->dhchap_step = NVME_AUTH_DHCHAP_MESSAGE_NEGOTIATE;
+		} else if (data->auth_id != req->sq->dhchap_step)
+			goto done_failure1;
+		/* Validate negotiation parameters */
+		status = nvmet_auth_negotiate(req, d);
+		if (status == 0)
+			req->sq->dhchap_step =
+				NVME_AUTH_DHCHAP_MESSAGE_CHALLENGE;
+		else {
+			req->sq->dhchap_step =
+				NVME_AUTH_DHCHAP_MESSAGE_FAILURE1;
+			req->sq->dhchap_status = status;
+			status = 0;
+		}
+		goto done_kfree;
+	}
+	if (data->auth_id != req->sq->dhchap_step) {
+		pr_debug("%s: ctrl %d qid %d step mismatch (%d != %d)\n",
+			 __func__, ctrl->cntlid, req->sq->qid,
+			 data->auth_id, req->sq->dhchap_step);
+		goto done_failure1;
+	}
+	if (le16_to_cpu(data->t_id) != req->sq->dhchap_tid) {
+		pr_debug("%s: ctrl %d qid %d invalid transaction %d (expected %d)\n",
+			 __func__, ctrl->cntlid, req->sq->qid,
+			 le16_to_cpu(data->t_id),
+			 req->sq->dhchap_tid);
+		req->sq->dhchap_step =
+			NVME_AUTH_DHCHAP_MESSAGE_FAILURE1;
+		req->sq->dhchap_status =
+			NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
+		goto done_kfree;
+	}
+
+	switch (data->auth_id) {
+	case NVME_AUTH_DHCHAP_MESSAGE_REPLY:
+		status = nvmet_auth_reply(req, d);
+		if (status == 0)
+			req->sq->dhchap_step =
+				NVME_AUTH_DHCHAP_MESSAGE_SUCCESS1;
+		else {
+			req->sq->dhchap_step =
+				NVME_AUTH_DHCHAP_MESSAGE_FAILURE1;
+			req->sq->dhchap_status = status;
+			status = 0;
+		}
+		goto done_kfree;
+		break;
+	case NVME_AUTH_DHCHAP_MESSAGE_SUCCESS2:
+		req->sq->authenticated = true;
+		pr_debug("%s: ctrl %d qid %d ctrl authenticated\n",
+			 __func__, ctrl->cntlid, req->sq->qid);
+		goto done_kfree;
+		break;
+	case NVME_AUTH_DHCHAP_MESSAGE_FAILURE2:
+		status = nvmet_auth_failure2(req, d);
+		if (status) {
+			pr_warn("ctrl %d qid %d: authentication failed (%d)\n",
+				ctrl->cntlid, req->sq->qid, status);
+			req->sq->dhchap_status = status;
+			req->sq->authenticated = false;
+			status = 0;
+		}
+		goto done_kfree;
+		break;
+	default:
+		req->sq->dhchap_status =
+			NVME_AUTH_DHCHAP_FAILURE_INCORRECT_MESSAGE;
+		req->sq->dhchap_step =
+			NVME_AUTH_DHCHAP_MESSAGE_FAILURE2;
+		req->sq->authenticated = false;
+		goto done_kfree;
+		break;
+	}
+done_failure1:
+	req->sq->dhchap_status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_MESSAGE;
+	req->sq->dhchap_step = NVME_AUTH_DHCHAP_MESSAGE_FAILURE2;
+
+done_kfree:
+	kfree(d);
+done:
+	pr_debug("%s: ctrl %d qid %d dhchap status %x step %x\n", __func__,
+		 ctrl->cntlid, req->sq->qid,
+		 req->sq->dhchap_status, req->sq->dhchap_step);
+	if (status)
+		pr_debug("%s: ctrl %d qid %d nvme status %x error loc %d\n",
+			 __func__, ctrl->cntlid, req->sq->qid,
+			 status, req->error_loc);
+	req->cqe->result.u64 = 0;
+	nvmet_req_complete(req, status);
+	if (req->sq->dhchap_step != NVME_AUTH_DHCHAP_MESSAGE_SUCCESS2 &&
+	    req->sq->dhchap_step != NVME_AUTH_DHCHAP_MESSAGE_FAILURE2) {
+		unsigned long auth_expire_secs = ctrl->kato ? ctrl->kato : 120;
+
+		mod_delayed_work(system_wq, &req->sq->auth_expired_work,
+				 auth_expire_secs * HZ);
+		return;
+	}
+	/* Final states, clear up variables */
+	nvmet_auth_sq_free(req->sq);
+	if (req->sq->dhchap_step == NVME_AUTH_DHCHAP_MESSAGE_FAILURE2)
+		nvmet_ctrl_fatal_error(ctrl);
+}
+
+static int nvmet_auth_challenge(struct nvmet_req *req, void *d, int al)
+{
+	struct nvmf_auth_dhchap_challenge_data *data = d;
+	struct nvmet_ctrl *ctrl = req->sq->ctrl;
+	int ret = 0;
+	int hash_len = nvme_auth_hmac_hash_len(ctrl->shash_id);
+	int data_size = sizeof(*d) + hash_len;
+
+	if (ctrl->dh_tfm)
+		data_size += ctrl->dh_keysize;
+	if (al < data_size) {
+		pr_debug("%s: buffer too small (al %d need %d)\n", __func__,
+			 al, data_size);
+		return -EINVAL;
+	}
+	memset(data, 0, data_size);
+	req->sq->dhchap_s1 = nvme_auth_get_seqnum();
+	data->auth_type = NVME_AUTH_DHCHAP_MESSAGES;
+	data->auth_id = NVME_AUTH_DHCHAP_MESSAGE_CHALLENGE;
+	data->t_id = cpu_to_le16(req->sq->dhchap_tid);
+	data->hashid = ctrl->shash_id;
+	data->hl = hash_len;
+	data->seqnum = cpu_to_le32(req->sq->dhchap_s1);
+	req->sq->dhchap_c1 = kmalloc(data->hl, GFP_KERNEL);
+	if (!req->sq->dhchap_c1)
+		return -ENOMEM;
+	get_random_bytes(req->sq->dhchap_c1, data->hl);
+	memcpy(data->cval, req->sq->dhchap_c1, data->hl);
+	if (ctrl->dh_tfm) {
+		data->dhgid = ctrl->dh_gid;
+		data->dhvlen = cpu_to_le16(ctrl->dh_keysize);
+		ret = nvmet_auth_ctrl_exponential(req, data->cval + data->hl,
+						  ctrl->dh_keysize);
+	}
+	pr_debug("%s: ctrl %d qid %d seq %d transaction %d hl %d dhvlen %zu\n",
+		 __func__, ctrl->cntlid, req->sq->qid, req->sq->dhchap_s1,
+		 req->sq->dhchap_tid, data->hl, ctrl->dh_keysize);
+	return ret;
+}
+
+static int nvmet_auth_success1(struct nvmet_req *req, void *d, int al)
+{
+	struct nvmf_auth_dhchap_success1_data *data = d;
+	struct nvmet_ctrl *ctrl = req->sq->ctrl;
+	int hash_len = nvme_auth_hmac_hash_len(ctrl->shash_id);
+
+	WARN_ON(al < sizeof(*data));
+	memset(data, 0, sizeof(*data));
+	data->auth_type = NVME_AUTH_DHCHAP_MESSAGES;
+	data->auth_id = NVME_AUTH_DHCHAP_MESSAGE_SUCCESS1;
+	data->t_id = cpu_to_le16(req->sq->dhchap_tid);
+	data->hl = hash_len;
+	if (req->sq->dhchap_c2) {
+		if (!ctrl->ctrl_key) {
+			pr_warn("ctrl %d qid %d no ctrl key\n",
+				ctrl->cntlid, req->sq->qid);
+			return NVME_AUTH_DHCHAP_FAILURE_FAILED;
+		}
+		if (nvmet_auth_ctrl_hash(req, data->rval, data->hl))
+			return NVME_AUTH_DHCHAP_FAILURE_HASH_UNUSABLE;
+		data->rvalid = 1;
+		pr_debug("ctrl %d qid %d response %*ph\n",
+			 ctrl->cntlid, req->sq->qid, data->hl, data->rval);
+	}
+	return 0;
+}
+
+static void nvmet_auth_failure1(struct nvmet_req *req, void *d, int al)
+{
+	struct nvmf_auth_dhchap_failure_data *data = d;
+
+	WARN_ON(al < sizeof(*data));
+	data->auth_type = NVME_AUTH_COMMON_MESSAGES;
+	data->auth_id = NVME_AUTH_DHCHAP_MESSAGE_FAILURE1;
+	data->t_id = cpu_to_le16(req->sq->dhchap_tid);
+	data->rescode = NVME_AUTH_DHCHAP_FAILURE_REASON_FAILED;
+	data->rescode_exp = req->sq->dhchap_status;
+}
+
+void nvmet_execute_auth_receive(struct nvmet_req *req)
+{
+	struct nvmet_ctrl *ctrl = req->sq->ctrl;
+	void *d;
+	u32 al;
+	u16 status = 0;
+
+	if (req->cmd->auth_receive.secp != NVME_AUTH_DHCHAP_PROTOCOL_IDENTIFIER) {
+		status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
+		req->error_loc =
+			offsetof(struct nvmf_auth_receive_command, secp);
+		goto done;
+	}
+	if (req->cmd->auth_receive.spsp0 != 0x01) {
+		status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
+		req->error_loc =
+			offsetof(struct nvmf_auth_receive_command, spsp0);
+		goto done;
+	}
+	if (req->cmd->auth_receive.spsp1 != 0x01) {
+		status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
+		req->error_loc =
+			offsetof(struct nvmf_auth_receive_command, spsp1);
+		goto done;
+	}
+	al = le32_to_cpu(req->cmd->auth_receive.al);
+	if (!al) {
+		status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
+		req->error_loc =
+			offsetof(struct nvmf_auth_receive_command, al);
+		goto done;
+	}
+	if (!nvmet_check_transfer_len(req, al)) {
+		pr_debug("%s: transfer length mismatch (%u)\n", __func__, al);
+		return;
+	}
+
+	d = kmalloc(al, GFP_KERNEL);
+	if (!d) {
+		status = NVME_SC_INTERNAL;
+		goto done;
+	}
+	pr_debug("%s: ctrl %d qid %d step %x\n", __func__,
+		 ctrl->cntlid, req->sq->qid, req->sq->dhchap_step);
+	switch (req->sq->dhchap_step) {
+	case NVME_AUTH_DHCHAP_MESSAGE_CHALLENGE:
+		if (nvmet_auth_challenge(req, d, al) < 0) {
+			pr_warn("ctrl %d qid %d: challenge error (%d)\n",
+				ctrl->cntlid, req->sq->qid, status);
+			status = NVME_SC_INTERNAL;
+			break;
+		}
+		if (status) {
+			req->sq->dhchap_status = status;
+			nvmet_auth_failure1(req, d, al);
+			pr_warn("ctrl %d qid %d: challenge status (%x)\n",
+				ctrl->cntlid, req->sq->qid,
+				req->sq->dhchap_status);
+			status = 0;
+			break;
+		}
+		req->sq->dhchap_step = NVME_AUTH_DHCHAP_MESSAGE_REPLY;
+		break;
+	case NVME_AUTH_DHCHAP_MESSAGE_SUCCESS1:
+		status = nvmet_auth_success1(req, d, al);
+		if (status) {
+			req->sq->dhchap_status = status;
+			req->sq->authenticated = false;
+			nvmet_auth_failure1(req, d, al);
+			pr_warn("ctrl %d qid %d: success1 status (%x)\n",
+				ctrl->cntlid, req->sq->qid,
+				req->sq->dhchap_status);
+			break;
+		}
+		req->sq->dhchap_step = NVME_AUTH_DHCHAP_MESSAGE_SUCCESS2;
+		break;
+	case NVME_AUTH_DHCHAP_MESSAGE_FAILURE1:
+		req->sq->authenticated = false;
+		nvmet_auth_failure1(req, d, al);
+		pr_warn("ctrl %d qid %d failure1 (%x)\n",
+			ctrl->cntlid, req->sq->qid, req->sq->dhchap_status);
+		break;
+	default:
+		pr_warn("ctrl %d qid %d unhandled step (%d)\n",
+			ctrl->cntlid, req->sq->qid, req->sq->dhchap_step);
+		req->sq->dhchap_step = NVME_AUTH_DHCHAP_MESSAGE_FAILURE1;
+		req->sq->dhchap_status = NVME_AUTH_DHCHAP_FAILURE_FAILED;
+		nvmet_auth_failure1(req, d, al);
+		status = 0;
+		break;
+	}
+
+	status = nvmet_copy_to_sgl(req, 0, d, al);
+	kfree(d);
+done:
+	req->cqe->result.u64 = 0;
+	nvmet_req_complete(req, status);
+	if (req->sq->dhchap_step == NVME_AUTH_DHCHAP_MESSAGE_SUCCESS2)
+		nvmet_auth_sq_free(req->sq);
+	else if (req->sq->dhchap_step == NVME_AUTH_DHCHAP_MESSAGE_FAILURE1) {
+		nvmet_auth_sq_free(req->sq);
+		nvmet_ctrl_fatal_error(ctrl);
+	}
+}
diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c
index 70fb587e9413..f91a56180d3d 100644
--- a/drivers/nvme/target/fabrics-cmd.c
+++ b/drivers/nvme/target/fabrics-cmd.c
@@ -82,7 +82,7 @@ static void nvmet_execute_prop_get(struct nvmet_req *req)
 	nvmet_req_complete(req, status);
 }
 
-u16 nvmet_parse_fabrics_cmd(struct nvmet_req *req)
+u16 nvmet_parse_fabrics_admin_cmd(struct nvmet_req *req)
 {
 	struct nvme_command *cmd = req->cmd;
 
@@ -93,6 +93,37 @@ u16 nvmet_parse_fabrics_cmd(struct nvmet_req *req)
 	case nvme_fabrics_type_property_get:
 		req->execute = nvmet_execute_prop_get;
 		break;
+#ifdef CONFIG_NVME_TARGET_AUTH
+	case nvme_fabrics_type_auth_send:
+		req->execute = nvmet_execute_auth_send;
+		break;
+	case nvme_fabrics_type_auth_receive:
+		req->execute = nvmet_execute_auth_receive;
+		break;
+#endif
+	default:
+		pr_debug("received unknown capsule type 0x%x\n",
+			cmd->fabrics.fctype);
+		req->error_loc = offsetof(struct nvmf_common_command, fctype);
+		return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
+	}
+
+	return 0;
+}
+
+u16 nvmet_parse_fabrics_io_cmd(struct nvmet_req *req)
+{
+	struct nvme_command *cmd = req->cmd;
+
+	switch (cmd->fabrics.fctype) {
+#ifdef CONFIG_NVME_TARGET_AUTH
+	case nvme_fabrics_type_auth_send:
+		req->execute = nvmet_execute_auth_send;
+		break;
+	case nvme_fabrics_type_auth_receive:
+		req->execute = nvmet_execute_auth_receive;
+		break;
+#endif
 	default:
 		pr_debug("received unknown capsule type 0x%x\n",
 			cmd->fabrics.fctype);
@@ -173,6 +204,7 @@ static void nvmet_execute_admin_connect(struct nvmet_req *req)
 	struct nvmf_connect_data *d;
 	struct nvmet_ctrl *ctrl = NULL;
 	u16 status = 0;
+	int ret;
 
 	if (!nvmet_check_transfer_len(req, sizeof(struct nvmf_connect_data)))
 		return;
@@ -215,18 +247,32 @@ static void nvmet_execute_admin_connect(struct nvmet_req *req)
 
 	uuid_copy(&ctrl->hostid, &d->hostid);
 
+	ret = nvmet_setup_auth(ctrl);
+	if (ret < 0) {
+		pr_err("Failed to setup authentication, error %d\n", ret);
+		nvmet_ctrl_put(ctrl);
+		if (ret == -EPERM)
+			status = (NVME_SC_CONNECT_INVALID_HOST | NVME_SC_DNR);
+		else
+			status = NVME_SC_INTERNAL;
+		goto out;
+	}
+
 	status = nvmet_install_queue(ctrl, req);
 	if (status) {
 		nvmet_ctrl_put(ctrl);
 		goto out;
 	}
 
-	pr_info("creating %s controller %d for subsystem %s for NQN %s%s.\n",
+	pr_info("creating %s controller %d for subsystem %s for NQN %s%s%s.\n",
 		nvmet_is_disc_subsys(ctrl->subsys) ? "discovery" : "nvm",
 		ctrl->cntlid, ctrl->subsys->subsysnqn, ctrl->hostnqn,
-		ctrl->pi_support ? " T10-PI is enabled" : "");
+		ctrl->pi_support ? " T10-PI is enabled" : "",
+		nvmet_has_auth(ctrl) ? " with DH-HMAC-CHAP" : "");
 	req->cqe->result.u16 = cpu_to_le16(ctrl->cntlid);
 
+	if (nvmet_has_auth(ctrl))
+		nvmet_init_auth(ctrl, req);
 out:
 	kfree(d);
 complete:
@@ -286,6 +332,9 @@ static void nvmet_execute_io_connect(struct nvmet_req *req)
 	req->cqe->result.u16 = cpu_to_le16(ctrl->cntlid);
 
 	pr_debug("adding queue %d to ctrl %d.\n", qid, ctrl->cntlid);
+	req->cqe->result.u16 = cpu_to_le16(ctrl->cntlid);
+	if (nvmet_has_auth(ctrl))
+		nvmet_init_auth(ctrl, req);
 
 out:
 	kfree(d);
diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
index 0f5c77e22a0a..9750a7fca268 100644
--- a/drivers/nvme/target/loop.c
+++ b/drivers/nvme/target/loop.c
@@ -424,9 +424,7 @@ static void nvme_loop_shutdown_ctrl(struct nvme_loop_ctrl *ctrl)
 {
 	if (ctrl->ctrl.queue_count > 1) {
 		nvme_stop_queues(&ctrl->ctrl);
-		blk_mq_tagset_busy_iter(&ctrl->tag_set,
-					nvme_cancel_request, &ctrl->ctrl);
-		blk_mq_tagset_wait_completed_request(&ctrl->tag_set);
+		nvme_cancel_tagset(&ctrl->ctrl);
 		nvme_loop_destroy_io_queues(ctrl);
 	}
 
@@ -434,9 +432,7 @@ static void nvme_loop_shutdown_ctrl(struct nvme_loop_ctrl *ctrl)
 	if (ctrl->ctrl.state == NVME_CTRL_LIVE)
 		nvme_shutdown_ctrl(&ctrl->ctrl);
 
-	blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
-				nvme_cancel_request, &ctrl->ctrl);
-	blk_mq_tagset_wait_completed_request(&ctrl->admin_tag_set);
+	nvme_cancel_admin_tagset(&ctrl->ctrl);
 	nvme_loop_destroy_admin_queue(ctrl);
 }
 
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index 2b3e5719f24e..6ffeeb0a1c49 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -108,6 +108,19 @@ struct nvmet_sq {
 	u16			size;
 	u32			sqhd;
 	bool			sqhd_disabled;
+#ifdef CONFIG_NVME_TARGET_AUTH
+	struct delayed_work	auth_expired_work;
+	bool			authenticated;
+	u16			dhchap_tid;
+	u16			dhchap_status;
+	int			dhchap_step;
+	u8			*dhchap_c1;
+	u8			*dhchap_c2;
+	u32			dhchap_s1;
+	u32			dhchap_s2;
+	u8			*dhchap_skey;
+	int			dhchap_skey_len;
+#endif
 	struct completion	free_done;
 	struct completion	confirm_done;
 };
@@ -209,6 +222,15 @@ struct nvmet_ctrl {
 	u64			err_counter;
 	struct nvme_error_slot	slots[NVMET_ERROR_LOG_SLOTS];
 	bool			pi_support;
+#ifdef CONFIG_NVME_TARGET_AUTH
+	struct nvme_dhchap_key	*host_key;
+	struct nvme_dhchap_key	*ctrl_key;
+	u8			shash_id;
+	struct crypto_kpp	*dh_tfm;
+	u8			dh_gid;
+	u8			*dh_key;
+	size_t			dh_keysize;
+#endif
 };
 
 struct nvmet_subsys {
@@ -271,6 +293,12 @@ static inline struct nvmet_subsys *namespaces_to_subsys(
 
 struct nvmet_host {
 	struct config_group	group;
+	u8			*dhchap_secret;
+	u8			*dhchap_ctrl_secret;
+	u8			dhchap_key_hash;
+	u8			dhchap_ctrl_key_hash;
+	u8			dhchap_hash_id;
+	u8			dhchap_dhgroup_id;
 };
 
 static inline struct nvmet_host *to_host(struct config_item *item)
@@ -420,7 +448,8 @@ u16 nvmet_file_parse_io_cmd(struct nvmet_req *req);
 u16 nvmet_bdev_zns_parse_io_cmd(struct nvmet_req *req);
 u16 nvmet_parse_admin_cmd(struct nvmet_req *req);
 u16 nvmet_parse_discovery_cmd(struct nvmet_req *req);
-u16 nvmet_parse_fabrics_cmd(struct nvmet_req *req);
+u16 nvmet_parse_fabrics_admin_cmd(struct nvmet_req *req);
+u16 nvmet_parse_fabrics_io_cmd(struct nvmet_req *req);
 
 bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq,
 		struct nvmet_sq *sq, const struct nvmet_fabrics_ops *ops);
@@ -668,4 +697,48 @@ static inline void nvmet_req_bio_put(struct nvmet_req *req, struct bio *bio)
 		bio_put(bio);
 }
 
+#ifdef CONFIG_NVME_TARGET_AUTH
+void nvmet_execute_auth_send(struct nvmet_req *req);
+void nvmet_execute_auth_receive(struct nvmet_req *req);
+int nvmet_auth_set_key(struct nvmet_host *host, const char *secret,
+		       bool set_ctrl);
+int nvmet_auth_set_host_hash(struct nvmet_host *host, const char *hash);
+int nvmet_setup_auth(struct nvmet_ctrl *ctrl);
+void nvmet_init_auth(struct nvmet_ctrl *ctrl, struct nvmet_req *req);
+void nvmet_destroy_auth(struct nvmet_ctrl *ctrl);
+void nvmet_auth_sq_free(struct nvmet_sq *sq);
+int nvmet_setup_dhgroup(struct nvmet_ctrl *ctrl, u8 dhgroup_id);
+bool nvmet_check_auth_status(struct nvmet_req *req);
+int nvmet_auth_host_hash(struct nvmet_req *req, u8 *response,
+			 unsigned int hash_len);
+int nvmet_auth_ctrl_hash(struct nvmet_req *req, u8 *response,
+			 unsigned int hash_len);
+static inline bool nvmet_has_auth(struct nvmet_ctrl *ctrl)
+{
+	return ctrl->host_key != NULL;
+}
+int nvmet_auth_ctrl_exponential(struct nvmet_req *req,
+				u8 *buf, int buf_size);
+int nvmet_auth_ctrl_sesskey(struct nvmet_req *req,
+			    u8 *buf, int buf_size);
+#else
+static inline int nvmet_setup_auth(struct nvmet_ctrl *ctrl)
+{
+	return 0;
+}
+static inline void nvmet_init_auth(struct nvmet_ctrl *ctrl,
+				   struct nvmet_req *req) {};
+static inline void nvmet_destroy_auth(struct nvmet_ctrl *ctrl) {};
+static inline void nvmet_auth_sq_free(struct nvmet_sq *sq) {};
+static inline bool nvmet_check_auth_status(struct nvmet_req *req)
+{
+	return true;
+}
+static inline bool nvmet_has_auth(struct nvmet_ctrl *ctrl)
+{
+	return false;
+}
+static inline const char *nvmet_dhchap_dhgroup_name(u8 dhgid) { return NULL; }
+#endif
+
 #endif /* _NVMET_H */
diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index 0a9542599ad1..dc3b4dc8fe08 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -1839,7 +1839,8 @@ static int __init nvmet_tcp_init(void)
 {
 	int ret;
 
-	nvmet_tcp_wq = alloc_workqueue("nvmet_tcp_wq", WQ_HIGHPRI, 0);
+	nvmet_tcp_wq = alloc_workqueue("nvmet_tcp_wq",
+				WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
 	if (!nvmet_tcp_wq)
 		return -ENOMEM;
 
diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c
index 4df8bf6505fc..ea82821599f6 100644
--- a/drivers/s390/block/dasd.c
+++ b/drivers/s390/block/dasd.c
@@ -1725,7 +1725,7 @@ void dasd_int_handler(struct ccw_device *cdev, unsigned long intparm,
 		dasd_put_device(device);
 	}
 
-	/* check for for attention message */
+	/* check for attention message */
 	if (scsw_dstat(&irb->scsw) & DEV_STAT_ATTENTION) {
 		device = dasd_device_from_cdev_locked(cdev);
 		if (!IS_ERR(device)) {
diff --git a/drivers/s390/block/dasd_diag.c b/drivers/s390/block/dasd_diag.c
index e9edf3b6ed7c..94ee59864971 100644
--- a/drivers/s390/block/dasd_diag.c
+++ b/drivers/s390/block/dasd_diag.c
@@ -639,6 +639,7 @@ static void dasd_diag_setup_blk_queue(struct dasd_block *block)
 	/* With page sized segments each segment can be translated into one idaw/tidaw */
 	blk_queue_max_segment_size(q, PAGE_SIZE);
 	blk_queue_segment_boundary(q, PAGE_SIZE - 1);
+	blk_queue_dma_alignment(q, PAGE_SIZE - 1);
 }
 
 static int dasd_diag_pe_handler(struct dasd_device *device,
diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c
index 836838f7d686..3cc93e2e4e15 100644
--- a/drivers/s390/block/dasd_eckd.c
+++ b/drivers/s390/block/dasd_eckd.c
@@ -6626,6 +6626,7 @@ static void dasd_eckd_setup_blk_queue(struct dasd_block *block)
 	/* With page sized segments each segment can be translated into one idaw/tidaw */
 	blk_queue_max_segment_size(q, PAGE_SIZE);
 	blk_queue_segment_boundary(q, PAGE_SIZE - 1);
+	blk_queue_dma_alignment(q, PAGE_SIZE - 1);
 }
 
 static struct ccw_driver dasd_eckd_driver = {
diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
index 4d8d1759775a..5187705bd0f3 100644
--- a/drivers/s390/block/dcssblk.c
+++ b/drivers/s390/block/dcssblk.c
@@ -863,7 +863,7 @@ dcssblk_submit_bio(struct bio *bio)
 	unsigned long source_addr;
 	unsigned long bytes_done;
 
-	blk_queue_split(&bio);
+	bio = bio_split_to_limits(bio);
 
 	bytes_done = 0;
 	dev_info = bio->bi_bdev->bd_disk->private_data;
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index 841fa6d9d744..641abfa4b718 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -517,36 +517,36 @@ bad_block:
 	/* Here we know that we can set the new attribute. */
 
 	if (header) {
-		/* assert(header == HDR(bh)); */
+		int offset;
+
 		lock_buffer(bh);
 		if (header->h_refcount == cpu_to_le32(1)) {
 			__u32 hash = le32_to_cpu(header->h_hash);
+			struct mb_cache_entry *oe;
 
-			ea_bdebug(bh, "modifying in-place");
+			oe = mb_cache_entry_delete_or_get(EA_BLOCK_CACHE(inode),
+					hash, bh->b_blocknr);
+			if (!oe) {
+				ea_bdebug(bh, "modifying in-place");
+				goto update_block;
+			}
 			/*
-			 * This must happen under buffer lock for
-			 * ext2_xattr_set2() to reliably detect modified block
+			 * Someone is trying to reuse the block, leave it alone
 			 */
-			mb_cache_entry_delete(EA_BLOCK_CACHE(inode), hash,
-					      bh->b_blocknr);
-
-			/* keep the buffer locked while modifying it. */
-		} else {
-			int offset;
-
-			unlock_buffer(bh);
-			ea_bdebug(bh, "cloning");
-			header = kmemdup(HDR(bh), bh->b_size, GFP_KERNEL);
-			error = -ENOMEM;
-			if (header == NULL)
-				goto cleanup;
-			header->h_refcount = cpu_to_le32(1);
-
-			offset = (char *)here - bh->b_data;
-			here = ENTRY((char *)header + offset);
-			offset = (char *)last - bh->b_data;
-			last = ENTRY((char *)header + offset);
+			mb_cache_entry_put(EA_BLOCK_CACHE(inode), oe);
 		}
+		unlock_buffer(bh);
+		ea_bdebug(bh, "cloning");
+		header = kmemdup(HDR(bh), bh->b_size, GFP_KERNEL);
+		error = -ENOMEM;
+		if (header == NULL)
+			goto cleanup;
+		header->h_refcount = cpu_to_le32(1);
+
+		offset = (char *)here - bh->b_data;
+		here = ENTRY((char *)header + offset);
+		offset = (char *)last - bh->b_data;
+		last = ENTRY((char *)header + offset);
 	} else {
 		/* Allocate a buffer where we construct the new block. */
 		header = kzalloc(sb->s_blocksize, GFP_KERNEL);
@@ -559,6 +559,7 @@ bad_block:
 		last = here = ENTRY(header+1);
 	}
 
+update_block:
 	/* Iff we are modifying the block in-place, bh is locked here. */
 
 	if (not_found) {
@@ -651,6 +652,55 @@ cleanup:
 	return error;
 }
 
+static void ext2_xattr_release_block(struct inode *inode,
+				     struct buffer_head *bh)
+{
+	struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode);
+
+retry_ref:
+	lock_buffer(bh);
+	if (HDR(bh)->h_refcount == cpu_to_le32(1)) {
+		__u32 hash = le32_to_cpu(HDR(bh)->h_hash);
+		struct mb_cache_entry *oe;
+
+		/*
+		 * This must happen under buffer lock to properly
+		 * serialize with ext2_xattr_set() reusing the block.
+		 */
+		oe = mb_cache_entry_delete_or_get(ea_block_cache, hash,
+						  bh->b_blocknr);
+		if (oe) {
+			/*
+			 * Someone is trying to reuse the block. Wait
+			 * and retry.
+			 */
+			unlock_buffer(bh);
+			mb_cache_entry_wait_unused(oe);
+			mb_cache_entry_put(ea_block_cache, oe);
+			goto retry_ref;
+		}
+
+		/* Free the old block. */
+		ea_bdebug(bh, "freeing");
+		ext2_free_blocks(inode, bh->b_blocknr, 1);
+		/* We let our caller release bh, so we
+		 * need to duplicate the buffer before. */
+		get_bh(bh);
+		bforget(bh);
+		unlock_buffer(bh);
+	} else {
+		/* Decrement the refcount only. */
+		le32_add_cpu(&HDR(bh)->h_refcount, -1);
+		dquot_free_block(inode, 1);
+		mark_buffer_dirty(bh);
+		unlock_buffer(bh);
+		ea_bdebug(bh, "refcount now=%d",
+			le32_to_cpu(HDR(bh)->h_refcount));
+		if (IS_SYNC(inode))
+			sync_dirty_buffer(bh);
+	}
+}
+
 /*
  * Second half of ext2_xattr_set(): Update the file system.
  */
@@ -747,34 +797,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
 		 * If there was an old block and we are no longer using it,
 		 * release the old block.
 		 */
-		lock_buffer(old_bh);
-		if (HDR(old_bh)->h_refcount == cpu_to_le32(1)) {
-			__u32 hash = le32_to_cpu(HDR(old_bh)->h_hash);
-
-			/*
-			 * This must happen under buffer lock for
-			 * ext2_xattr_set2() to reliably detect freed block
-			 */
-			mb_cache_entry_delete(ea_block_cache, hash,
-					      old_bh->b_blocknr);
-			/* Free the old block. */
-			ea_bdebug(old_bh, "freeing");
-			ext2_free_blocks(inode, old_bh->b_blocknr, 1);
-			mark_inode_dirty(inode);
-			/* We let our caller release old_bh, so we
-			 * need to duplicate the buffer before. */
-			get_bh(old_bh);
-			bforget(old_bh);
-		} else {
-			/* Decrement the refcount only. */
-			le32_add_cpu(&HDR(old_bh)->h_refcount, -1);
-			dquot_free_block_nodirty(inode, 1);
-			mark_inode_dirty(inode);
-			mark_buffer_dirty(old_bh);
-			ea_bdebug(old_bh, "refcount now=%d",
-				le32_to_cpu(HDR(old_bh)->h_refcount));
-		}
-		unlock_buffer(old_bh);
+		ext2_xattr_release_block(inode, old_bh);
 	}
 
 cleanup:
@@ -828,30 +851,7 @@ ext2_xattr_delete_inode(struct inode *inode)
 			EXT2_I(inode)->i_file_acl);
 		goto cleanup;
 	}
-	lock_buffer(bh);
-	if (HDR(bh)->h_refcount == cpu_to_le32(1)) {
-		__u32 hash = le32_to_cpu(HDR(bh)->h_hash);
-
-		/*
-		 * This must happen under buffer lock for ext2_xattr_set2() to
-		 * reliably detect freed block
-		 */
-		mb_cache_entry_delete(EA_BLOCK_CACHE(inode), hash,
-				      bh->b_blocknr);
-		ext2_free_blocks(inode, EXT2_I(inode)->i_file_acl, 1);
-		get_bh(bh);
-		bforget(bh);
-		unlock_buffer(bh);
-	} else {
-		le32_add_cpu(&HDR(bh)->h_refcount, -1);
-		ea_bdebug(bh, "refcount now=%d",
-			le32_to_cpu(HDR(bh)->h_refcount));
-		unlock_buffer(bh);
-		mark_buffer_dirty(bh);
-		if (IS_SYNC(inode))
-			sync_dirty_buffer(bh);
-		dquot_free_block_nodirty(inode, 1);
-	}
+	ext2_xattr_release_block(inode, bh);
 	EXT2_I(inode)->i_file_acl = 0;
 
 cleanup:
@@ -943,7 +943,7 @@ ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header)
 	if (!header->h_hash)
 		return NULL;  /* never share */
 	ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
-again:
+
 	ce = mb_cache_entry_find_first(ea_block_cache, hash);
 	while (ce) {
 		struct buffer_head *bh;
@@ -955,22 +955,8 @@ again:
 				inode->i_ino, (unsigned long) ce->e_value);
 		} else {
 			lock_buffer(bh);
-			/*
-			 * We have to be careful about races with freeing or
-			 * rehashing of xattr block. Once we hold buffer lock
-			 * xattr block's state is stable so we can check
-			 * whether the block got freed / rehashed or not.
-			 * Since we unhash mbcache entry under buffer lock when
-			 * freeing / rehashing xattr block, checking whether
-			 * entry is still hashed is reliable.
-			 */
-			if (hlist_bl_unhashed(&ce->e_hash_list)) {
-				mb_cache_entry_put(ea_block_cache, ce);
-				unlock_buffer(bh);
-				brelse(bh);
-				goto again;
-			} else if (le32_to_cpu(HDR(bh)->h_refcount) >
-				   EXT2_XATTR_REFCOUNT_MAX) {
+			if (le32_to_cpu(HDR(bh)->h_refcount) >
+			    EXT2_XATTR_REFCOUNT_MAX) {
 				ea_idebug(inode, "block %ld refcount %d>%d",
 					  (unsigned long) ce->e_value,
 					  le32_to_cpu(HDR(bh)->h_refcount),
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 78ee3ef795ae..8ff4b9192a9f 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -666,7 +666,7 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
 	 * it's possible we've just missed a transaction commit here,
 	 * so ignore the returned status
 	 */
-	jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
+	ext4_debug("%s: retrying operation after ENOSPC\n", sb->s_id);
 	(void) jbd2_journal_force_commit_nested(sbi->s_journal);
 	return 1;
 }
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 29fc575a4eb6..9bca5565547b 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -724,6 +724,8 @@ enum {
 #define EXT4_IOC_GETSTATE		_IOW('f', 41, __u32)
 #define EXT4_IOC_GET_ES_CACHE		_IOWR('f', 42, struct fiemap)
 #define EXT4_IOC_CHECKPOINT		_IOW('f', 43, __u32)
+#define EXT4_IOC_GETFSUUID		_IOR('f', 44, struct fsuuid)
+#define EXT4_IOC_SETFSUUID		_IOW('f', 44, struct fsuuid)
 
 #define EXT4_IOC_SHUTDOWN _IOR ('X', 125, __u32)
 
@@ -753,6 +755,15 @@ enum {
 						EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT | \
 						EXT4_IOC_CHECKPOINT_FLAG_DRY_RUN)
 
+/*
+ * Structure for EXT4_IOC_GETFSUUID/EXT4_IOC_SETFSUUID
+ */
+struct fsuuid {
+	__u32       fsu_len;
+	__u32       fsu_flags;
+	__u8        fsu_uuid[];
+};
+
 #if defined(__KERNEL__) && defined(CONFIG_COMPAT)
 /*
  * ioctl commands in 32 bit emulation
@@ -3016,7 +3027,7 @@ int ext4_fileattr_set(struct user_namespace *mnt_userns,
 		      struct dentry *dentry, struct fileattr *fa);
 int ext4_fileattr_get(struct dentry *dentry, struct fileattr *fa);
 extern void ext4_reset_inode_seed(struct inode *inode);
-int ext4_update_overhead(struct super_block *sb);
+int ext4_update_overhead(struct super_block *sb, bool force);
 
 /* migrate.c */
 extern int ext4_ext_migrate(struct inode *);
@@ -3583,6 +3594,7 @@ extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode,
 extern int ext4_inline_data_fiemap(struct inode *inode,
 				   struct fiemap_extent_info *fieinfo,
 				   int *has_inline, __u64 start, __u64 len);
+extern void *ext4_read_inline_link(struct inode *inode);
 
 struct iomap;
 extern int ext4_inline_data_iomap(struct inode *inode, struct iomap *iomap);
@@ -3799,7 +3811,7 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh)
 extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
 
 extern int ext4_resize_begin(struct super_block *sb);
-extern void ext4_resize_end(struct super_block *sb);
+extern int ext4_resize_end(struct super_block *sb, bool update_backups);
 
 static inline void ext4_set_io_unwritten_flag(struct inode *inode,
 					      struct ext4_io_end *io_end)
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 3477a16d08ae..8e1fb18f465e 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -267,8 +267,7 @@ int __ext4_forget(const char *where, unsigned int line, handle_t *handle,
 	trace_ext4_forget(inode, is_metadata, blocknr);
 	BUFFER_TRACE(bh, "enter");
 
-	jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
-		  "data mode %x\n",
+	ext4_debug("forgetting bh %p: is_metadata=%d, mode %o, data mode %x\n",
 		  bh, is_metadata, inode->i_mode,
 		  test_opt(inode->i_sb, DATA_FLAGS));
 
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index eb4c8ad1bb61..2af962cbb835 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -917,8 +917,8 @@ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
 	mutex_unlock(&ei->i_fc_lock);
 
 	cur_lblk_off = old_blk_size;
-	jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n",
-		  __func__, cur_lblk_off, new_blk_size, inode->i_ino);
+	ext4_debug("will try writing %d to %d for inode %ld\n",
+		   cur_lblk_off, new_blk_size, inode->i_ino);
 
 	while (cur_lblk_off <= new_blk_size) {
 		map.m_lblk = cur_lblk_off;
@@ -1168,7 +1168,7 @@ static void ext4_fc_update_stats(struct super_block *sb, int status,
 {
 	struct ext4_fc_stats *stats = &EXT4_SB(sb)->s_fc_stats;
 
-	jbd_debug(1, "Fast commit ended with status = %d for tid %u",
+	ext4_debug("Fast commit ended with status = %d for tid %u",
 			status, commit_tid);
 	if (status == EXT4_FC_STATUS_OK) {
 		stats->fc_num_commits++;
@@ -1375,14 +1375,14 @@ static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl,
 	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
 
 	if (IS_ERR(inode)) {
-		jbd_debug(1, "Inode %d not found", darg.ino);
+		ext4_debug("Inode %d not found", darg.ino);
 		return 0;
 	}
 
 	old_parent = ext4_iget(sb, darg.parent_ino,
 				EXT4_IGET_NORMAL);
 	if (IS_ERR(old_parent)) {
-		jbd_debug(1, "Dir with inode  %d not found", darg.parent_ino);
+		ext4_debug("Dir with inode %d not found", darg.parent_ino);
 		iput(inode);
 		return 0;
 	}
@@ -1407,21 +1407,21 @@ static int ext4_fc_replay_link_internal(struct super_block *sb,
 
 	dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
 	if (IS_ERR(dir)) {
-		jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
+		ext4_debug("Dir with inode %d not found.", darg->parent_ino);
 		dir = NULL;
 		goto out;
 	}
 
 	dentry_dir = d_obtain_alias(dir);
 	if (IS_ERR(dentry_dir)) {
-		jbd_debug(1, "Failed to obtain dentry");
+		ext4_debug("Failed to obtain dentry");
 		dentry_dir = NULL;
 		goto out;
 	}
 
 	dentry_inode = d_alloc(dentry_dir, &qstr_dname);
 	if (!dentry_inode) {
-		jbd_debug(1, "Inode dentry not created.");
+		ext4_debug("Inode dentry not created.");
 		ret = -ENOMEM;
 		goto out;
 	}
@@ -1434,7 +1434,7 @@ static int ext4_fc_replay_link_internal(struct super_block *sb,
 	 * could complete.
 	 */
 	if (ret && ret != -EEXIST) {
-		jbd_debug(1, "Failed to link\n");
+		ext4_debug("Failed to link\n");
 		goto out;
 	}
 
@@ -1468,7 +1468,7 @@ static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl,
 
 	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
 	if (IS_ERR(inode)) {
-		jbd_debug(1, "Inode not found.");
+		ext4_debug("Inode not found.");
 		return 0;
 	}
 
@@ -1576,7 +1576,7 @@ static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl,
 	/* Given that we just wrote the inode on disk, this SHOULD succeed. */
 	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
 	if (IS_ERR(inode)) {
-		jbd_debug(1, "Inode not found.");
+		ext4_debug("Inode not found.");
 		return -EFSCORRUPTED;
 	}
 
@@ -1630,7 +1630,7 @@ static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl,
 
 	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
 	if (IS_ERR(inode)) {
-		jbd_debug(1, "inode %d not found.", darg.ino);
+		ext4_debug("inode %d not found.", darg.ino);
 		inode = NULL;
 		ret = -EINVAL;
 		goto out;
@@ -1643,7 +1643,7 @@ static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl,
 		 */
 		dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
 		if (IS_ERR(dir)) {
-			jbd_debug(1, "Dir %d not found.", darg.ino);
+			ext4_debug("Dir %d not found.", darg.ino);
 			goto out;
 		}
 		ret = ext4_init_new_dir(NULL, dir, inode);
@@ -1727,7 +1727,7 @@ static int ext4_fc_replay_add_range(struct super_block *sb,
 
 	inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL);
 	if (IS_ERR(inode)) {
-		jbd_debug(1, "Inode not found.");
+		ext4_debug("Inode not found.");
 		return 0;
 	}
 
@@ -1741,7 +1741,7 @@ static int ext4_fc_replay_add_range(struct super_block *sb,
 
 	cur = start;
 	remaining = len;
-	jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
+	ext4_debug("ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
 		  start, start_pblk, len, ext4_ext_is_unwritten(ex),
 		  inode->i_ino);
 
@@ -1802,7 +1802,7 @@ static int ext4_fc_replay_add_range(struct super_block *sb,
 		}
 
 		/* Range is mapped and needs a state change */
-		jbd_debug(1, "Converting from %ld to %d %lld",
+		ext4_debug("Converting from %ld to %d %lld",
 				map.m_flags & EXT4_MAP_UNWRITTEN,
 			ext4_ext_is_unwritten(ex), map.m_pblk);
 		ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
@@ -1845,7 +1845,7 @@ ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl,
 
 	inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL);
 	if (IS_ERR(inode)) {
-		jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange.fc_ino));
+		ext4_debug("Inode %d not found", le32_to_cpu(lrange.fc_ino));
 		return 0;
 	}
 
@@ -1853,7 +1853,7 @@ ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl,
 	if (ret)
 		goto out;
 
-	jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
+	ext4_debug("DEL_RANGE, inode %ld, lblk %d, len %d\n",
 			inode->i_ino, le32_to_cpu(lrange.fc_lblk),
 			le32_to_cpu(lrange.fc_len));
 	while (remaining > 0) {
@@ -1902,7 +1902,7 @@ static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
 		inode = ext4_iget(sb, state->fc_modified_inodes[i],
 			EXT4_IGET_NORMAL);
 		if (IS_ERR(inode)) {
-			jbd_debug(1, "Inode %d not found.",
+			ext4_debug("Inode %d not found.",
 				state->fc_modified_inodes[i]);
 			continue;
 		}
@@ -2031,7 +2031,7 @@ static int ext4_fc_replay_scan(journal_t *journal,
 	for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
 		memcpy(&tl, cur, sizeof(tl));
 		val = cur + sizeof(tl);
-		jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
+		ext4_debug("Scan phase, tag:%s, blk %lld\n",
 			  tag2str(le16_to_cpu(tl.fc_tag)), bh->b_blocknr);
 		switch (le16_to_cpu(tl.fc_tag)) {
 		case EXT4_FC_TAG_ADD_RANGE:
@@ -2126,7 +2126,7 @@ static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
 		sbi->s_mount_state |= EXT4_FC_REPLAY;
 	}
 	if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
-		jbd_debug(1, "Replay stops\n");
+		ext4_debug("Replay stops\n");
 		ext4_fc_set_bitmaps_and_counters(sb);
 		return 0;
 	}
@@ -2150,7 +2150,7 @@ static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
 			ext4_fc_set_bitmaps_and_counters(sb);
 			break;
 		}
-		jbd_debug(3, "Replay phase, tag:%s\n",
+		ext4_debug("Replay phase, tag:%s\n",
 				tag2str(le16_to_cpu(tl.fc_tag)));
 		state->fc_replay_num_tags--;
 		switch (le16_to_cpu(tl.fc_tag)) {
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 07a8c75b65ed..860fc5119009 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -460,7 +460,7 @@ static int ext4_splice_branch(handle_t *handle,
 		 * the new i_size.  But that is not done here - it is done in
 		 * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode.
 		 */
-		jbd_debug(5, "splicing indirect only\n");
+		ext4_debug("splicing indirect only\n");
 		BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata");
 		err = ext4_handle_dirty_metadata(handle, ar->inode, where->bh);
 		if (err)
@@ -472,7 +472,7 @@ static int ext4_splice_branch(handle_t *handle,
 		err = ext4_mark_inode_dirty(handle, ar->inode);
 		if (unlikely(err))
 			goto err_out;
-		jbd_debug(5, "splicing direct\n");
+		ext4_debug("splicing direct\n");
 	}
 	return err;
 
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index cff52ff6549d..a4fbe825694b 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -6,6 +6,7 @@
 
 #include <linux/iomap.h>
 #include <linux/fiemap.h>
+#include <linux/namei.h>
 #include <linux/iversion.h>
 #include <linux/sched/mm.h>
 
@@ -35,6 +36,9 @@ static int get_max_inline_xattr_value_size(struct inode *inode,
 	struct ext4_inode *raw_inode;
 	int free, min_offs;
 
+	if (!EXT4_INODE_HAS_XATTR_SPACE(inode))
+		return 0;
+
 	min_offs = EXT4_SB(inode->i_sb)->s_inode_size -
 			EXT4_GOOD_OLD_INODE_SIZE -
 			EXT4_I(inode)->i_extra_isize -
@@ -1588,6 +1592,35 @@ out:
 	return ret;
 }
 
+void *ext4_read_inline_link(struct inode *inode)
+{
+	struct ext4_iloc iloc;
+	int ret, inline_size;
+	void *link;
+
+	ret = ext4_get_inode_loc(inode, &iloc);
+	if (ret)
+		return ERR_PTR(ret);
+
+	ret = -ENOMEM;
+	inline_size = ext4_get_inline_size(inode);
+	link = kmalloc(inline_size + 1, GFP_NOFS);
+	if (!link)
+		goto out;
+
+	ret = ext4_read_inline_data(inode, link, inline_size, &iloc);
+	if (ret < 0) {
+		kfree(link);
+		goto out;
+	}
+	nd_terminate_link(link, inode->i_size, ret);
+out:
+	if (ret < 0)
+		link = ERR_PTR(ret);
+	brelse(iloc.bh);
+	return link;
+}
+
 struct buffer_head *ext4_get_first_inline_block(struct inode *inode,
 					struct ext4_dir_entry_2 **parent_de,
 					int *retval)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 9fd60fc8ba4c..601214453c3a 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -177,6 +177,8 @@ void ext4_evict_inode(struct inode *inode)
 
 	trace_ext4_evict_inode(inode);
 
+	if (EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)
+		ext4_evict_ea_inode(inode);
 	if (inode->i_nlink) {
 		/*
 		 * When journalling data dirty buffers are tracked only in the
@@ -1571,7 +1573,14 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd,
 		ext4_lblk_t start, last;
 		start = index << (PAGE_SHIFT - inode->i_blkbits);
 		last = end << (PAGE_SHIFT - inode->i_blkbits);
+
+		/*
+		 * avoid racing with extent status tree scans made by
+		 * ext4_insert_delayed_block()
+		 */
+		down_write(&EXT4_I(inode)->i_data_sem);
 		ext4_es_remove_extent(inode, start, last - start + 1);
+		up_write(&EXT4_I(inode)->i_data_sem);
 	}
 
 	folio_batch_init(&fbatch);
@@ -3142,13 +3151,15 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
 {
 	struct inode *inode = mapping->host;
 	journal_t *journal;
+	sector_t ret = 0;
 	int err;
 
+	inode_lock_shared(inode);
 	/*
 	 * We can get here for an inline file via the FIBMAP ioctl
 	 */
 	if (ext4_has_inline_data(inode))
-		return 0;
+		goto out;
 
 	if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
 			test_opt(inode->i_sb, DELALLOC)) {
@@ -3187,10 +3198,14 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
 		jbd2_journal_unlock_updates(journal);
 
 		if (err)
-			return 0;
+			goto out;
 	}
 
-	return iomap_bmap(mapping, block, &ext4_iomap_ops);
+	ret = iomap_bmap(mapping, block, &ext4_iomap_ops);
+
+out:
+	inode_unlock_shared(inode);
+	return ret;
 }
 
 static int ext4_read_folio(struct file *file, struct folio *folio)
@@ -4687,8 +4702,7 @@ static inline int ext4_iget_extra_inode(struct inode *inode,
 	__le32 *magic = (void *)raw_inode +
 			EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize;
 
-	if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize + sizeof(__le32) <=
-	    EXT4_INODE_SIZE(inode->i_sb) &&
+	if (EXT4_INODE_HAS_XATTR_SPACE(inode)  &&
 	    *magic == cpu_to_le32(EXT4_XATTR_MAGIC)) {
 		ext4_set_inode_state(inode, EXT4_STATE_XATTR);
 		return ext4_find_inline_data_nolock(inode);
@@ -5215,7 +5229,7 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
 
 	if (EXT4_SB(inode->i_sb)->s_journal) {
 		if (ext4_journal_current_handle()) {
-			jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
+			ext4_debug("called recursively, non-PF_MEMALLOC!\n");
 			dump_stack();
 			return -EIO;
 		}
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index cb01c1da0f9d..3cf3ec4b1c21 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -20,6 +20,7 @@
 #include <linux/delay.h>
 #include <linux/iversion.h>
 #include <linux/fileattr.h>
+#include <linux/uuid.h>
 #include "ext4_jbd2.h"
 #include "ext4.h"
 #include <linux/fsmap.h>
@@ -41,6 +42,15 @@ static void ext4_sb_setlabel(struct ext4_super_block *es, const void *arg)
 	memcpy(es->s_volume_name, (char *)arg, EXT4_LABEL_MAX);
 }
 
+/*
+ * Superblock modification callback function for changing file system
+ * UUID.
+ */
+static void ext4_sb_setuuid(struct ext4_super_block *es, const void *arg)
+{
+	memcpy(es->s_uuid, (__u8 *)arg, UUID_SIZE);
+}
+
 static
 int ext4_update_primary_sb(struct super_block *sb, handle_t *handle,
 			   ext4_update_sb_callback func,
@@ -944,7 +954,9 @@ static long ext4_ioctl_group_add(struct file *file,
 	    test_opt(sb, INIT_INODE_TABLE))
 		err = ext4_register_li_request(sb, input->group);
 group_add_out:
-	ext4_resize_end(sb);
+	err2 = ext4_resize_end(sb, false);
+	if (err == 0)
+		err = err2;
 	return err;
 }
 
@@ -1131,6 +1143,73 @@ static int ext4_ioctl_getlabel(struct ext4_sb_info *sbi, char __user *user_label
 	return 0;
 }
 
+static int ext4_ioctl_getuuid(struct ext4_sb_info *sbi,
+			struct fsuuid __user *ufsuuid)
+{
+	struct fsuuid fsuuid;
+	__u8 uuid[UUID_SIZE];
+
+	if (copy_from_user(&fsuuid, ufsuuid, sizeof(fsuuid)))
+		return -EFAULT;
+
+	if (fsuuid.fsu_len == 0) {
+		fsuuid.fsu_len = UUID_SIZE;
+		if (copy_to_user(ufsuuid, &fsuuid, sizeof(fsuuid.fsu_len)))
+			return -EFAULT;
+		return -EINVAL;
+	}
+
+	if (fsuuid.fsu_len != UUID_SIZE || fsuuid.fsu_flags != 0)
+		return -EINVAL;
+
+	lock_buffer(sbi->s_sbh);
+	memcpy(uuid, sbi->s_es->s_uuid, UUID_SIZE);
+	unlock_buffer(sbi->s_sbh);
+
+	if (copy_to_user(&ufsuuid->fsu_uuid[0], uuid, UUID_SIZE))
+		return -EFAULT;
+	return 0;
+}
+
+static int ext4_ioctl_setuuid(struct file *filp,
+			const struct fsuuid __user *ufsuuid)
+{
+	int ret = 0;
+	struct super_block *sb = file_inode(filp)->i_sb;
+	struct fsuuid fsuuid;
+	__u8 uuid[UUID_SIZE];
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	/*
+	 * If any checksums (group descriptors or metadata) are being used
+	 * then the checksum seed feature is required to change the UUID.
+	 */
+	if (((ext4_has_feature_gdt_csum(sb) || ext4_has_metadata_csum(sb))
+			&& !ext4_has_feature_csum_seed(sb))
+		|| ext4_has_feature_stable_inodes(sb))
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&fsuuid, ufsuuid, sizeof(fsuuid)))
+		return -EFAULT;
+
+	if (fsuuid.fsu_len != UUID_SIZE || fsuuid.fsu_flags != 0)
+		return -EINVAL;
+
+	if (copy_from_user(uuid, &ufsuuid->fsu_uuid[0], UUID_SIZE))
+		return -EFAULT;
+
+	ret = mnt_want_write_file(filp);
+	if (ret)
+		return ret;
+
+	ret = ext4_update_superblocks_fn(sb, ext4_sb_setuuid, &uuid);
+	mnt_drop_write_file(filp);
+
+	return ret;
+}
+
 static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
 	struct inode *inode = file_inode(filp);
@@ -1223,7 +1302,9 @@ setversion_out:
 			err = err2;
 		mnt_drop_write_file(filp);
 group_extend_out:
-		ext4_resize_end(sb);
+		err2 = ext4_resize_end(sb, false);
+		if (err == 0)
+			err = err2;
 		return err;
 	}
 
@@ -1371,7 +1452,9 @@ mext_out:
 			err = ext4_register_li_request(sb, o_group);
 
 resizefs_out:
-		ext4_resize_end(sb);
+		err2 = ext4_resize_end(sb, true);
+		if (err == 0)
+			err = err2;
 		return err;
 	}
 
@@ -1509,6 +1592,10 @@ resizefs_out:
 		return ext4_ioctl_setlabel(filp,
 					   (const void __user *)arg);
 
+	case EXT4_IOC_GETFSUUID:
+		return ext4_ioctl_getuuid(EXT4_SB(sb), (void __user *)arg);
+	case EXT4_IOC_SETFSUUID:
+		return ext4_ioctl_setuuid(filp, (const void __user *)arg);
 	default:
 		return -ENOTTY;
 	}
@@ -1586,6 +1673,8 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	case EXT4_IOC_CHECKPOINT:
 	case FS_IOC_GETFSLABEL:
 	case FS_IOC_SETFSLABEL:
+	case EXT4_IOC_GETFSUUID:
+	case EXT4_IOC_SETFSUUID:
 		break;
 	default:
 		return -ENOIOCTLCMD;
@@ -1599,13 +1688,15 @@ static void set_overhead(struct ext4_super_block *es, const void *arg)
 	es->s_overhead_clusters = cpu_to_le32(*((unsigned long *) arg));
 }
 
-int ext4_update_overhead(struct super_block *sb)
+int ext4_update_overhead(struct super_block *sb, bool force)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
-	if (sb_rdonly(sb) || sbi->s_overhead == 0 ||
-	    sbi->s_overhead == le32_to_cpu(sbi->s_es->s_overhead_clusters))
+	if (sb_rdonly(sb))
+		return 0;
+	if (!force &&
+	    (sbi->s_overhead == 0 ||
+	     sbi->s_overhead == le32_to_cpu(sbi->s_es->s_overhead_clusters)))
 		return 0;
-
 	return ext4_update_superblocks_fn(sb, set_overhead, &sbi->s_overhead);
 }
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 9e06334771a3..bd8f8b5c3d30 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1933,6 +1933,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
 	unsigned ret = 0;
 	int len0 = len;
 	void *buddy;
+	bool split = false;
 
 	BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3));
 	BUG_ON(e4b->bd_group != ex->fe_group);
@@ -1957,12 +1958,16 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
 
 	/* let's maintain buddy itself */
 	while (len) {
-		ord = mb_find_order_for_block(e4b, start);
+		if (!split)
+			ord = mb_find_order_for_block(e4b, start);
 
 		if (((start >> ord) << ord) == start && len >= (1 << ord)) {
 			/* the whole chunk may be allocated at once! */
 			mlen = 1 << ord;
-			buddy = mb_find_buddy(e4b, ord, &max);
+			if (!split)
+				buddy = mb_find_buddy(e4b, ord, &max);
+			else
+				split = false;
 			BUG_ON((start >> ord) >= max);
 			mb_set_bit(start >> ord, buddy);
 			e4b->bd_info->bb_counters[ord]--;
@@ -1989,6 +1994,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
 		mb_clear_bit(cur + 1, buddy);
 		e4b->bd_info->bb_counters[ord]++;
 		e4b->bd_info->bb_counters[ord]++;
+		split = true;
 	}
 	mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
 
@@ -5928,6 +5934,15 @@ static void ext4_mb_clear_bb(handle_t *handle, struct inode *inode,
 
 	sbi = EXT4_SB(sb);
 
+	if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
+	    !ext4_inode_block_valid(inode, block, count)) {
+		ext4_error(sb, "Freeing blocks in system zone - "
+			   "Block = %llu, count = %lu", block, count);
+		/* err = 0. ext4_std_error should be a no op */
+		goto error_return;
+	}
+	flags |= EXT4_FREE_BLOCKS_VALIDATED;
+
 do_more:
 	overflow = 0;
 	ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
@@ -5944,6 +5959,8 @@ do_more:
 		overflow = EXT4_C2B(sbi, bit) + count -
 			EXT4_BLOCKS_PER_GROUP(sb);
 		count -= overflow;
+		/* The range changed so it's no longer validated */
+		flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
 	}
 	count_clusters = EXT4_NUM_B2C(sbi, count);
 	bitmap_bh = ext4_read_block_bitmap(sb, block_group);
@@ -5958,7 +5975,8 @@ do_more:
 		goto error_return;
 	}
 
-	if (!ext4_inode_block_valid(inode, block, count)) {
+	if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
+	    !ext4_inode_block_valid(inode, block, count)) {
 		ext4_error(sb, "Freeing blocks in system zone - "
 			   "Block = %llu, count = %lu", block, count);
 		/* err = 0. ext4_std_error should be a no op */
@@ -6081,6 +6099,8 @@ do_more:
 		block += count;
 		count = overflow;
 		put_bh(bitmap_bh);
+		/* The range changed so it's no longer validated */
+		flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
 		goto do_more;
 	}
 error_return:
@@ -6127,6 +6147,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
 			   "block = %llu, count = %lu", block, count);
 		return;
 	}
+	flags |= EXT4_FREE_BLOCKS_VALIDATED;
 
 	ext4_debug("freeing block %llu\n", block);
 	trace_ext4_free_blocks(inode, block, count, flags);
@@ -6158,6 +6179,8 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
 			block -= overflow;
 			count += overflow;
 		}
+		/* The range changed so it's no longer validated */
+		flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
 	}
 	overflow = EXT4_LBLK_COFF(sbi, count);
 	if (overflow) {
@@ -6168,6 +6191,8 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
 				return;
 		} else
 			count += sbi->s_cluster_ratio - overflow;
+		/* The range changed so it's no longer validated */
+		flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
 	}
 
 	if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 42f590518b4c..54e7d3c95fd7 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -417,7 +417,7 @@ int ext4_ext_migrate(struct inode *inode)
 	struct inode *tmp_inode = NULL;
 	struct migrate_struct lb;
 	unsigned long max_entries;
-	__u32 goal;
+	__u32 goal, tmp_csum_seed;
 	uid_t owner[2];
 
 	/*
@@ -465,6 +465,7 @@ int ext4_ext_migrate(struct inode *inode)
 	 * the migration.
 	 */
 	ei = EXT4_I(inode);
+	tmp_csum_seed = EXT4_I(tmp_inode)->i_csum_seed;
 	EXT4_I(tmp_inode)->i_csum_seed = ei->i_csum_seed;
 	i_size_write(tmp_inode, i_size_read(inode));
 	/*
@@ -575,6 +576,7 @@ err_out:
 	 * the inode is not visible to user space.
 	 */
 	tmp_inode->i_blocks = 0;
+	EXT4_I(tmp_inode)->i_csum_seed = tmp_csum_seed;
 
 	/* Reset the extent details */
 	ext4_ext_tree_init(handle, tmp_inode);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index db4ba99d1ceb..3a31b662f661 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -54,6 +54,7 @@ static struct buffer_head *ext4_append(handle_t *handle,
 					struct inode *inode,
 					ext4_lblk_t *block)
 {
+	struct ext4_map_blocks map;
 	struct buffer_head *bh;
 	int err;
 
@@ -63,6 +64,21 @@ static struct buffer_head *ext4_append(handle_t *handle,
 		return ERR_PTR(-ENOSPC);
 
 	*block = inode->i_size >> inode->i_sb->s_blocksize_bits;
+	map.m_lblk = *block;
+	map.m_len = 1;
+
+	/*
+	 * We're appending new directory block. Make sure the block is not
+	 * allocated yet, otherwise we will end up corrupting the
+	 * directory.
+	 */
+	err = ext4_map_blocks(NULL, inode, &map, 0);
+	if (err < 0)
+		return ERR_PTR(err);
+	if (err) {
+		EXT4_ERROR_INODE(inode, "Logical block already allocated");
+		return ERR_PTR(-EFSCORRUPTED);
+	}
 
 	bh = ext4_bread(handle, inode, *block, EXT4_GET_BLOCKS_CREATE);
 	if (IS_ERR(bh))
@@ -110,6 +126,13 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
 	struct ext4_dir_entry *dirent;
 	int is_dx_block = 0;
 
+	if (block >= inode->i_size) {
+		ext4_error_inode(inode, func, line, block,
+		       "Attempting to read directory block (%u) that is past i_size (%llu)",
+		       block, inode->i_size);
+		return ERR_PTR(-EFSCORRUPTED);
+	}
+
 	if (ext4_simulate_fail(inode->i_sb, EXT4_SIM_DIRBLOCK_EIO))
 		bh = ERR_PTR(-EIO);
 	else
@@ -3067,11 +3090,8 @@ bool ext4_empty_dir(struct inode *inode)
 		de = (struct ext4_dir_entry_2 *) (bh->b_data +
 					(offset & (sb->s_blocksize - 1)));
 		if (ext4_check_dir_entry(inode, NULL, de, bh,
-					 bh->b_data, bh->b_size, offset)) {
-			offset = (offset | (sb->s_blocksize - 1)) + 1;
-			continue;
-		}
-		if (le32_to_cpu(de->inode)) {
+					 bh->b_data, bh->b_size, offset) ||
+		    le32_to_cpu(de->inode)) {
 			brelse(bh);
 			return false;
 		}
diff --git a/fs/ext4/orphan.c b/fs/ext4/orphan.c
index 7de0612eb42d..69a9cf9137a6 100644
--- a/fs/ext4/orphan.c
+++ b/fs/ext4/orphan.c
@@ -181,8 +181,8 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
 	} else
 		brelse(iloc.bh);
 
-	jbd_debug(4, "superblock will point to %lu\n", inode->i_ino);
-	jbd_debug(4, "orphan inode %lu will point to %d\n",
+	ext4_debug("superblock will point to %lu\n", inode->i_ino);
+	ext4_debug("orphan inode %lu will point to %d\n",
 			inode->i_ino, NEXT_ORPHAN(inode));
 out:
 	ext4_std_error(sb, err);
@@ -251,7 +251,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
 	}
 
 	mutex_lock(&sbi->s_orphan_lock);
-	jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino);
+	ext4_debug("remove inode %lu from orphan list\n", inode->i_ino);
 
 	prev = ei->i_orphan.prev;
 	list_del_init(&ei->i_orphan);
@@ -267,7 +267,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
 
 	ino_next = NEXT_ORPHAN(inode);
 	if (prev == &sbi->s_orphan) {
-		jbd_debug(4, "superblock will point to %u\n", ino_next);
+		ext4_debug("superblock will point to %u\n", ino_next);
 		BUFFER_TRACE(sbi->s_sbh, "get_write_access");
 		err = ext4_journal_get_write_access(handle, inode->i_sb,
 						    sbi->s_sbh, EXT4_JTR_NONE);
@@ -286,7 +286,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
 		struct inode *i_prev =
 			&list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode;
 
-		jbd_debug(4, "orphan inode %lu will point to %u\n",
+		ext4_debug("orphan inode %lu will point to %u\n",
 			  i_prev->i_ino, ino_next);
 		err = ext4_reserve_inode_write(handle, i_prev, &iloc2);
 		if (err) {
@@ -332,8 +332,8 @@ static void ext4_process_orphan(struct inode *inode,
 			ext4_msg(sb, KERN_DEBUG,
 				"%s: truncating inode %lu to %lld bytes",
 				__func__, inode->i_ino, inode->i_size);
-		jbd_debug(2, "truncating inode %lu to %lld bytes\n",
-			  inode->i_ino, inode->i_size);
+		ext4_debug("truncating inode %lu to %lld bytes\n",
+			   inode->i_ino, inode->i_size);
 		inode_lock(inode);
 		truncate_inode_pages(inode->i_mapping, inode->i_size);
 		ret = ext4_truncate(inode);
@@ -353,8 +353,8 @@ static void ext4_process_orphan(struct inode *inode,
 			ext4_msg(sb, KERN_DEBUG,
 				"%s: deleting unreferenced inode %lu",
 				__func__, inode->i_ino);
-		jbd_debug(2, "deleting unreferenced inode %lu\n",
-			  inode->i_ino);
+		ext4_debug("deleting unreferenced inode %lu\n",
+			   inode->i_ino);
 		(*nr_orphans)++;
 	}
 	iput(inode);  /* The delete magic happens here! */
@@ -391,7 +391,7 @@ void ext4_orphan_cleanup(struct super_block *sb, struct ext4_super_block *es)
 	int inodes_per_ob = ext4_inodes_per_orphan_block(sb);
 
 	if (!es->s_last_orphan && !oi->of_blocks) {
-		jbd_debug(4, "no orphan inodes to clean up\n");
+		ext4_debug("no orphan inodes to clean up\n");
 		return;
 	}
 
@@ -415,7 +415,7 @@ void ext4_orphan_cleanup(struct super_block *sb, struct ext4_super_block *es)
 				  "clearing orphan list.\n");
 			es->s_last_orphan = 0;
 		}
-		jbd_debug(1, "Skipping orphan recovery on fs with errors.\n");
+		ext4_debug("Skipping orphan recovery on fs with errors.\n");
 		return;
 	}
 
@@ -459,7 +459,7 @@ void ext4_orphan_cleanup(struct super_block *sb, struct ext4_super_block *es)
 		 * so, skip the rest.
 		 */
 		if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
-			jbd_debug(1, "Skipping orphan recovery on fs with errors.\n");
+			ext4_debug("Skipping orphan recovery on fs with errors.\n");
 			es->s_last_orphan = 0;
 			break;
 		}
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 8b70a4701293..fea2a68d067b 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -97,10 +97,13 @@ int ext4_resize_begin(struct super_block *sb)
 	return ret;
 }
 
-void ext4_resize_end(struct super_block *sb)
+int ext4_resize_end(struct super_block *sb, bool update_backups)
 {
 	clear_bit_unlock(EXT4_FLAGS_RESIZING, &EXT4_SB(sb)->s_ext4_flags);
 	smp_mb__after_atomic();
+	if (update_backups)
+		return ext4_update_overhead(sb, true);
+	return 0;
 }
 
 static ext4_group_t ext4_meta_bg_first_group(struct super_block *sb,
@@ -1380,6 +1383,17 @@ static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb,
 	return err;
 }
 
+static void ext4_add_overhead(struct super_block *sb,
+                              const ext4_fsblk_t overhead)
+{
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+       struct ext4_super_block *es = sbi->s_es;
+
+       sbi->s_overhead += overhead;
+       es->s_overhead_clusters = cpu_to_le32(sbi->s_overhead);
+       smp_wmb();
+}
+
 /*
  * ext4_update_super() updates the super block so that the newly added
  * groups can be seen by the filesystem.
@@ -1481,9 +1495,18 @@ static void ext4_update_super(struct super_block *sb,
 	}
 
 	/*
-	 * Update the fs overhead information
+	 * Update the fs overhead information.
+	 *
+	 * For bigalloc, if the superblock already has a properly calculated
+	 * overhead, update it with a value based on numbers already computed
+	 * above for the newly allocated capacity.
 	 */
-	ext4_calculate_overhead(sb);
+	if (ext4_has_feature_bigalloc(sb) && (sbi->s_overhead != 0))
+		ext4_add_overhead(sb,
+			EXT4_NUM_B2C(sbi, blocks_count - free_blocks));
+	else
+		ext4_calculate_overhead(sb);
+	es->s_overhead_clusters = cpu_to_le32(sbi->s_overhead);
 
 	if (test_opt(sb, DEBUG))
 		printk(KERN_DEBUG "EXT4-fs: added group %u:"
@@ -1988,6 +2011,16 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
 	}
 	brelse(bh);
 
+	/*
+	 * For bigalloc, trim the requested size to the nearest cluster
+	 * boundary to avoid creating an unusable filesystem. We do this
+	 * silently, instead of returning an error, to avoid breaking
+	 * callers that blindly resize the filesystem to the full size of
+	 * the underlying block device.
+	 */
+	if (ext4_has_feature_bigalloc(sb))
+		n_blocks_count &= ~((1 << EXT4_CLUSTER_BITS(sb)) - 1);
+
 retry:
 	o_blocks_count = ext4_blocks_count(es);
 
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 2c68dec63e54..8f907e9d9c76 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3011,6 +3011,15 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
 	} else if (test_opt2(sb, DAX_INODE)) {
 		SEQ_OPTS_PUTS("dax=inode");
 	}
+
+	if (sbi->s_groups_count >= MB_DEFAULT_LINEAR_SCAN_THRESHOLD &&
+			!test_opt2(sb, MB_OPTIMIZE_SCAN)) {
+		SEQ_OPTS_PUTS("mb_optimize_scan=0");
+	} else if (sbi->s_groups_count < MB_DEFAULT_LINEAR_SCAN_THRESHOLD &&
+			test_opt2(sb, MB_OPTIMIZE_SCAN)) {
+		SEQ_OPTS_PUTS("mb_optimize_scan=1");
+	}
+
 	ext4_show_quota_options(seq, sb);
 	return 0;
 }
@@ -5523,7 +5532,7 @@ static int ext4_fill_super(struct super_block *sb, struct fs_context *fc)
 			 "Quota mode: %s.", descr, ext4_quota_mode(sb));
 
 	/* Update the s_overhead_clusters if necessary */
-	ext4_update_overhead(sb);
+	ext4_update_overhead(sb, false);
 	return 0;
 
 free_sbi:
@@ -5585,7 +5594,7 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb,
 		return NULL;
 	}
 
-	jbd_debug(2, "Journal inode found at %p: %lld bytes\n",
+	ext4_debug("Journal inode found at %p: %lld bytes\n",
 		  journal_inode, journal_inode->i_size);
 	if (!S_ISREG(journal_inode->i_mode)) {
 		ext4_msg(sb, KERN_ERR, "invalid journal inode");
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
index d281f5bcc526..3d3ed3c38f56 100644
--- a/fs/ext4/symlink.c
+++ b/fs/ext4/symlink.c
@@ -74,6 +74,21 @@ static const char *ext4_get_link(struct dentry *dentry, struct inode *inode,
 				 struct delayed_call *callback)
 {
 	struct buffer_head *bh;
+	char *inline_link;
+
+	/*
+	 * Create a new inlined symlink is not supported, just provide a
+	 * method to read the leftovers.
+	 */
+	if (ext4_has_inline_data(inode)) {
+		if (!dentry)
+			return ERR_PTR(-ECHILD);
+
+		inline_link = ext4_read_inline_link(inode);
+		if (!IS_ERR(inline_link))
+			set_delayed_call(callback, kfree_link, inline_link);
+		return inline_link;
+	}
 
 	if (!dentry) {
 		bh = ext4_getblk(NULL, inode, 0, EXT4_GET_BLOCKS_CACHED_NOWAIT);
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 564e28a1aa94..533216e80fa2 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -436,6 +436,21 @@ error:
 	return err;
 }
 
+/* Remove entry from mbcache when EA inode is getting evicted */
+void ext4_evict_ea_inode(struct inode *inode)
+{
+	struct mb_cache_entry *oe;
+
+	if (!EA_INODE_CACHE(inode))
+		return;
+	/* Wait for entry to get unused so that we can remove it */
+	while ((oe = mb_cache_entry_delete_or_get(EA_INODE_CACHE(inode),
+			ext4_xattr_inode_get_hash(inode), inode->i_ino))) {
+		mb_cache_entry_wait_unused(oe);
+		mb_cache_entry_put(EA_INODE_CACHE(inode), oe);
+	}
+}
+
 static int
 ext4_xattr_inode_verify_hashes(struct inode *ea_inode,
 			       struct ext4_xattr_entry *entry, void *buffer,
@@ -976,10 +991,8 @@ int __ext4_xattr_set_credits(struct super_block *sb, struct inode *inode,
 static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode,
 				       int ref_change)
 {
-	struct mb_cache *ea_inode_cache = EA_INODE_CACHE(ea_inode);
 	struct ext4_iloc iloc;
 	s64 ref_count;
-	u32 hash;
 	int ret;
 
 	inode_lock(ea_inode);
@@ -1002,14 +1015,6 @@ static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode,
 
 			set_nlink(ea_inode, 1);
 			ext4_orphan_del(handle, ea_inode);
-
-			if (ea_inode_cache) {
-				hash = ext4_xattr_inode_get_hash(ea_inode);
-				mb_cache_entry_create(ea_inode_cache,
-						      GFP_NOFS, hash,
-						      ea_inode->i_ino,
-						      true /* reusable */);
-			}
 		}
 	} else {
 		WARN_ONCE(ref_count < 0, "EA inode %lu ref_count=%lld",
@@ -1022,12 +1027,6 @@ static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode,
 
 			clear_nlink(ea_inode);
 			ext4_orphan_add(handle, ea_inode);
-
-			if (ea_inode_cache) {
-				hash = ext4_xattr_inode_get_hash(ea_inode);
-				mb_cache_entry_delete(ea_inode_cache, hash,
-						      ea_inode->i_ino);
-			}
 		}
 	}
 
@@ -1237,6 +1236,7 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
 	if (error)
 		goto out;
 
+retry_ref:
 	lock_buffer(bh);
 	hash = le32_to_cpu(BHDR(bh)->h_hash);
 	ref = le32_to_cpu(BHDR(bh)->h_refcount);
@@ -1246,9 +1246,18 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
 		 * This must happen under buffer lock for
 		 * ext4_xattr_block_set() to reliably detect freed block
 		 */
-		if (ea_block_cache)
-			mb_cache_entry_delete(ea_block_cache, hash,
-					      bh->b_blocknr);
+		if (ea_block_cache) {
+			struct mb_cache_entry *oe;
+
+			oe = mb_cache_entry_delete_or_get(ea_block_cache, hash,
+							  bh->b_blocknr);
+			if (oe) {
+				unlock_buffer(bh);
+				mb_cache_entry_wait_unused(oe);
+				mb_cache_entry_put(ea_block_cache, oe);
+				goto retry_ref;
+			}
+		}
 		get_bh(bh);
 		unlock_buffer(bh);
 
@@ -1858,6 +1867,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 #define header(x) ((struct ext4_xattr_header *)(x))
 
 	if (s->base) {
+		int offset = (char *)s->here - bs->bh->b_data;
+
 		BUFFER_TRACE(bs->bh, "get_write_access");
 		error = ext4_journal_get_write_access(handle, sb, bs->bh,
 						      EXT4_JTR_NONE);
@@ -1873,9 +1884,20 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 			 * ext4_xattr_block_set() to reliably detect modified
 			 * block
 			 */
-			if (ea_block_cache)
-				mb_cache_entry_delete(ea_block_cache, hash,
-						      bs->bh->b_blocknr);
+			if (ea_block_cache) {
+				struct mb_cache_entry *oe;
+
+				oe = mb_cache_entry_delete_or_get(ea_block_cache,
+					hash, bs->bh->b_blocknr);
+				if (oe) {
+					/*
+					 * Xattr block is getting reused. Leave
+					 * it alone.
+					 */
+					mb_cache_entry_put(ea_block_cache, oe);
+					goto clone_block;
+				}
+			}
 			ea_bdebug(bs->bh, "modifying in-place");
 			error = ext4_xattr_set_entry(i, s, handle, inode,
 						     true /* is_block */);
@@ -1890,49 +1912,47 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 			if (error)
 				goto cleanup;
 			goto inserted;
-		} else {
-			int offset = (char *)s->here - bs->bh->b_data;
+		}
+clone_block:
+		unlock_buffer(bs->bh);
+		ea_bdebug(bs->bh, "cloning");
+		s->base = kmemdup(BHDR(bs->bh), bs->bh->b_size, GFP_NOFS);
+		error = -ENOMEM;
+		if (s->base == NULL)
+			goto cleanup;
+		s->first = ENTRY(header(s->base)+1);
+		header(s->base)->h_refcount = cpu_to_le32(1);
+		s->here = ENTRY(s->base + offset);
+		s->end = s->base + bs->bh->b_size;
 
-			unlock_buffer(bs->bh);
-			ea_bdebug(bs->bh, "cloning");
-			s->base = kmemdup(BHDR(bs->bh), bs->bh->b_size, GFP_NOFS);
-			error = -ENOMEM;
-			if (s->base == NULL)
+		/*
+		 * If existing entry points to an xattr inode, we need
+		 * to prevent ext4_xattr_set_entry() from decrementing
+		 * ref count on it because the reference belongs to the
+		 * original block. In this case, make the entry look
+		 * like it has an empty value.
+		 */
+		if (!s->not_found && s->here->e_value_inum) {
+			ea_ino = le32_to_cpu(s->here->e_value_inum);
+			error = ext4_xattr_inode_iget(inode, ea_ino,
+				      le32_to_cpu(s->here->e_hash),
+				      &tmp_inode);
+			if (error)
 				goto cleanup;
-			s->first = ENTRY(header(s->base)+1);
-			header(s->base)->h_refcount = cpu_to_le32(1);
-			s->here = ENTRY(s->base + offset);
-			s->end = s->base + bs->bh->b_size;
 
-			/*
-			 * If existing entry points to an xattr inode, we need
-			 * to prevent ext4_xattr_set_entry() from decrementing
-			 * ref count on it because the reference belongs to the
-			 * original block. In this case, make the entry look
-			 * like it has an empty value.
-			 */
-			if (!s->not_found && s->here->e_value_inum) {
-				ea_ino = le32_to_cpu(s->here->e_value_inum);
-				error = ext4_xattr_inode_iget(inode, ea_ino,
-					      le32_to_cpu(s->here->e_hash),
-					      &tmp_inode);
-				if (error)
-					goto cleanup;
-
-				if (!ext4_test_inode_state(tmp_inode,
-						EXT4_STATE_LUSTRE_EA_INODE)) {
-					/*
-					 * Defer quota free call for previous
-					 * inode until success is guaranteed.
-					 */
-					old_ea_inode_quota = le32_to_cpu(
-							s->here->e_value_size);
-				}
-				iput(tmp_inode);
-
-				s->here->e_value_inum = 0;
-				s->here->e_value_size = 0;
+			if (!ext4_test_inode_state(tmp_inode,
+					EXT4_STATE_LUSTRE_EA_INODE)) {
+				/*
+				 * Defer quota free call for previous
+				 * inode until success is guaranteed.
+				 */
+				old_ea_inode_quota = le32_to_cpu(
+						s->here->e_value_size);
 			}
+			iput(tmp_inode);
+
+			s->here->e_value_inum = 0;
+			s->here->e_value_size = 0;
 		}
 	} else {
 		/* Allocate a buffer where we construct the new block. */
@@ -1999,18 +2019,13 @@ inserted:
 				lock_buffer(new_bh);
 				/*
 				 * We have to be careful about races with
-				 * freeing, rehashing or adding references to
-				 * xattr block. Once we hold buffer lock xattr
-				 * block's state is stable so we can check
-				 * whether the block got freed / rehashed or
-				 * not.  Since we unhash mbcache entry under
-				 * buffer lock when freeing / rehashing xattr
-				 * block, checking whether entry is still
-				 * hashed is reliable. Same rules hold for
-				 * e_reusable handling.
+				 * adding references to xattr block. Once we
+				 * hold buffer lock xattr block's state is
+				 * stable so we can check the additional
+				 * reference fits.
 				 */
-				if (hlist_bl_unhashed(&ce->e_hash_list) ||
-				    !ce->e_reusable) {
+				ref = le32_to_cpu(BHDR(new_bh)->h_refcount) + 1;
+				if (ref > EXT4_XATTR_REFCOUNT_MAX) {
 					/*
 					 * Undo everything and check mbcache
 					 * again.
@@ -2025,9 +2040,8 @@ inserted:
 					new_bh = NULL;
 					goto inserted;
 				}
-				ref = le32_to_cpu(BHDR(new_bh)->h_refcount) + 1;
 				BHDR(new_bh)->h_refcount = cpu_to_le32(ref);
-				if (ref >= EXT4_XATTR_REFCOUNT_MAX)
+				if (ref == EXT4_XATTR_REFCOUNT_MAX)
 					ce->e_reusable = 0;
 				ea_bdebug(new_bh, "reusing; refcount now=%d",
 					  ref);
@@ -2175,8 +2189,9 @@ int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
 	struct ext4_inode *raw_inode;
 	int error;
 
-	if (EXT4_I(inode)->i_extra_isize == 0)
+	if (!EXT4_INODE_HAS_XATTR_SPACE(inode))
 		return 0;
+
 	raw_inode = ext4_raw_inode(&is->iloc);
 	header = IHDR(inode, raw_inode);
 	is->s.base = is->s.first = IFIRST(header);
@@ -2204,8 +2219,9 @@ int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
 	struct ext4_xattr_search *s = &is->s;
 	int error;
 
-	if (EXT4_I(inode)->i_extra_isize == 0)
+	if (!EXT4_INODE_HAS_XATTR_SPACE(inode))
 		return -ENOSPC;
+
 	error = ext4_xattr_set_entry(i, s, handle, inode, false /* is_block */);
 	if (error)
 		return error;
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 77efb9a627ad..824faf0b15a8 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -84,7 +84,7 @@ struct ext4_xattr_entry {
 /*
  * The minimum size of EA value when you start storing it in an external inode
  * size of block - size of header - size of 1 entry - 4 null bytes
-*/
+ */
 #define EXT4_XATTR_MIN_LARGE_EA_SIZE(b)					\
 	((b) - EXT4_XATTR_LEN(3) - sizeof(struct ext4_xattr_header) - 4)
 
@@ -95,6 +95,19 @@ struct ext4_xattr_entry {
 
 #define EXT4_ZERO_XATTR_VALUE ((void *)-1)
 
+/*
+ * If we want to add an xattr to the inode, we should make sure that
+ * i_extra_isize is not 0 and that the inode size is not less than
+ * EXT4_GOOD_OLD_INODE_SIZE + extra_isize + pad.
+ *   EXT4_GOOD_OLD_INODE_SIZE   extra_isize header   entry   pad  data
+ * |--------------------------|------------|------|---------|---|-------|
+ */
+#define EXT4_INODE_HAS_XATTR_SPACE(inode)				\
+	((EXT4_I(inode)->i_extra_isize != 0) &&				\
+	 (EXT4_GOOD_OLD_INODE_SIZE + EXT4_I(inode)->i_extra_isize +	\
+	  sizeof(struct ext4_xattr_ibody_header) + EXT4_XATTR_PAD <=	\
+	  EXT4_INODE_SIZE((inode)->i_sb)))
+
 struct ext4_xattr_info {
 	const char *name;
 	const void *value;
@@ -178,6 +191,7 @@ extern void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *array);
 
 extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
 			    struct ext4_inode *raw_inode, handle_t *handle);
+extern void ext4_evict_ea_inode(struct inode *inode);
 
 extern const struct xattr_handler *ext4_xattr_handlers[];
 
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 746132998c57..51bd38da21cd 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -203,7 +203,7 @@ int jbd2_log_do_checkpoint(journal_t *journal)
 	tid_t			this_tid;
 	int			result, batch_count = 0;
 
-	jbd_debug(1, "Start checkpoint\n");
+	jbd2_debug(1, "Start checkpoint\n");
 
 	/*
 	 * First thing: if there are any transactions in the log which
@@ -212,7 +212,7 @@ int jbd2_log_do_checkpoint(journal_t *journal)
 	 */
 	result = jbd2_cleanup_journal_tail(journal);
 	trace_jbd2_checkpoint(journal, result);
-	jbd_debug(1, "cleanup_journal_tail returned %d\n", result);
+	jbd2_debug(1, "cleanup_journal_tail returned %d\n", result);
 	if (result <= 0)
 		return result;
 
@@ -804,5 +804,5 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact
 
 	trace_jbd2_drop_transaction(journal, transaction);
 
-	jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid);
+	jbd2_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid);
 }
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 890b5543a1c5..b2b2bc9b88d9 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -421,7 +421,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 
 	/* Do we need to erase the effects of a prior jbd2_journal_flush? */
 	if (journal->j_flags & JBD2_FLUSHED) {
-		jbd_debug(3, "super block updated\n");
+		jbd2_debug(3, "super block updated\n");
 		mutex_lock_io(&journal->j_checkpoint_mutex);
 		/*
 		 * We hold j_checkpoint_mutex so tail cannot change under us.
@@ -435,7 +435,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 						REQ_SYNC);
 		mutex_unlock(&journal->j_checkpoint_mutex);
 	} else {
-		jbd_debug(3, "superblock not updated\n");
+		jbd2_debug(3, "superblock not updated\n");
 	}
 
 	J_ASSERT(journal->j_running_transaction != NULL);
@@ -467,7 +467,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	commit_transaction = journal->j_running_transaction;
 
 	trace_jbd2_start_commit(journal, commit_transaction);
-	jbd_debug(1, "JBD2: starting commit of transaction %d\n",
+	jbd2_debug(1, "JBD2: starting commit of transaction %d\n",
 			commit_transaction->t_tid);
 
 	write_lock(&journal->j_state_lock);
@@ -540,7 +540,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	__jbd2_journal_clean_checkpoint_list(journal, false);
 	spin_unlock(&journal->j_list_lock);
 
-	jbd_debug(3, "JBD2: commit phase 1\n");
+	jbd2_debug(3, "JBD2: commit phase 1\n");
 
 	/*
 	 * Clear revoked flag to reflect there is no revoked buffers
@@ -553,13 +553,13 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	 */
 	jbd2_journal_switch_revoke_table(journal);
 
+	write_lock(&journal->j_state_lock);
 	/*
 	 * Reserved credits cannot be claimed anymore, free them
 	 */
 	atomic_sub(atomic_read(&journal->j_reserved_credits),
 		   &commit_transaction->t_outstanding_credits);
 
-	write_lock(&journal->j_state_lock);
 	trace_jbd2_commit_flushing(journal, commit_transaction);
 	stats.run.rs_flushing = jiffies;
 	stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
@@ -573,7 +573,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	wake_up(&journal->j_wait_transaction_locked);
 	write_unlock(&journal->j_state_lock);
 
-	jbd_debug(3, "JBD2: commit phase 2a\n");
+	jbd2_debug(3, "JBD2: commit phase 2a\n");
 
 	/*
 	 * Now start flushing things to disk, in the order they appear
@@ -586,7 +586,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	blk_start_plug(&plug);
 	jbd2_journal_write_revoke_records(commit_transaction, &log_bufs);
 
-	jbd_debug(3, "JBD2: commit phase 2b\n");
+	jbd2_debug(3, "JBD2: commit phase 2b\n");
 
 	/*
 	 * Way to go: we have now written out all of the data for a
@@ -642,7 +642,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 		if (!descriptor) {
 			J_ASSERT (bufs == 0);
 
-			jbd_debug(4, "JBD2: get descriptor\n");
+			jbd2_debug(4, "JBD2: get descriptor\n");
 
 			descriptor = jbd2_journal_get_descriptor_buffer(
 							commit_transaction,
@@ -652,7 +652,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 				continue;
 			}
 
-			jbd_debug(4, "JBD2: got buffer %llu (%p)\n",
+			jbd2_debug(4, "JBD2: got buffer %llu (%p)\n",
 				(unsigned long long)descriptor->b_blocknr,
 				descriptor->b_data);
 			tagp = &descriptor->b_data[sizeof(journal_header_t)];
@@ -737,7 +737,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 		    commit_transaction->t_buffers == NULL ||
 		    space_left < tag_bytes + 16 + csum_size) {
 
-			jbd_debug(4, "JBD2: Submit %d IOs\n", bufs);
+			jbd2_debug(4, "JBD2: Submit %d IOs\n", bufs);
 
 			/* Write an end-of-descriptor marker before
                            submitting the IOs.  "tag" still points to
@@ -839,7 +839,7 @@ start_journal_io:
 	   so we incur less scheduling load.
 	*/
 
-	jbd_debug(3, "JBD2: commit phase 3\n");
+	jbd2_debug(3, "JBD2: commit phase 3\n");
 
 	while (!list_empty(&io_bufs)) {
 		struct buffer_head *bh = list_entry(io_bufs.prev,
@@ -882,7 +882,7 @@ start_journal_io:
 
 	J_ASSERT (commit_transaction->t_shadow_list == NULL);
 
-	jbd_debug(3, "JBD2: commit phase 4\n");
+	jbd2_debug(3, "JBD2: commit phase 4\n");
 
 	/* Here we wait for the revoke record and descriptor record buffers */
 	while (!list_empty(&log_bufs)) {
@@ -906,7 +906,7 @@ start_journal_io:
 	if (err)
 		jbd2_journal_abort(journal, err);
 
-	jbd_debug(3, "JBD2: commit phase 5\n");
+	jbd2_debug(3, "JBD2: commit phase 5\n");
 	write_lock(&journal->j_state_lock);
 	J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH);
 	commit_transaction->t_state = T_COMMIT_JFLUSH;
@@ -945,7 +945,7 @@ start_journal_io:
            transaction can be removed from any checkpoint list it was on
            before. */
 
-	jbd_debug(3, "JBD2: commit phase 6\n");
+	jbd2_debug(3, "JBD2: commit phase 6\n");
 
 	J_ASSERT(list_empty(&commit_transaction->t_inode_list));
 	J_ASSERT(commit_transaction->t_buffers == NULL);
@@ -1122,7 +1122,7 @@ restart_loop:
 
 	/* Done with this transaction! */
 
-	jbd_debug(3, "JBD2: commit phase 7\n");
+	jbd2_debug(3, "JBD2: commit phase 7\n");
 
 	J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH);
 
@@ -1164,7 +1164,7 @@ restart_loop:
 		journal->j_fc_cleanup_callback(journal, 1, commit_transaction->t_tid);
 
 	trace_jbd2_end_commit(journal, commit_transaction);
-	jbd_debug(1, "JBD2: commit %d complete, head %d\n",
+	jbd2_debug(1, "JBD2: commit %d complete, head %d\n",
 		  journal->j_commit_sequence, journal->j_tail_sequence);
 
 	write_lock(&journal->j_state_lock);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 2a1b9da7c3e3..b083961ea383 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -49,8 +49,7 @@
 #include <asm/page.h>
 
 #ifdef CONFIG_JBD2_DEBUG
-ushort jbd2_journal_enable_debug __read_mostly;
-EXPORT_SYMBOL(jbd2_journal_enable_debug);
+static ushort jbd2_journal_enable_debug __read_mostly;
 
 module_param_named(jbd2_debug, jbd2_journal_enable_debug, ushort, 0644);
 MODULE_PARM_DESC(jbd2_debug, "Debugging level for jbd2");
@@ -81,7 +80,6 @@ EXPORT_SYMBOL(jbd2_journal_errno);
 EXPORT_SYMBOL(jbd2_journal_ack_err);
 EXPORT_SYMBOL(jbd2_journal_clear_err);
 EXPORT_SYMBOL(jbd2_log_wait_commit);
-EXPORT_SYMBOL(jbd2_log_start_commit);
 EXPORT_SYMBOL(jbd2_journal_start_commit);
 EXPORT_SYMBOL(jbd2_journal_force_commit_nested);
 EXPORT_SYMBOL(jbd2_journal_wipe);
@@ -115,7 +113,6 @@ void __jbd2_debug(int level, const char *file, const char *func,
 	printk(KERN_DEBUG "%s: (%s, %u): %pV", file, func, line, &vaf);
 	va_end(args);
 }
-EXPORT_SYMBOL(__jbd2_debug);
 #endif
 
 /* Checksumming functions */
@@ -203,11 +200,11 @@ loop:
 	if (journal->j_flags & JBD2_UNMOUNT)
 		goto end_loop;
 
-	jbd_debug(1, "commit_sequence=%u, commit_request=%u\n",
+	jbd2_debug(1, "commit_sequence=%u, commit_request=%u\n",
 		journal->j_commit_sequence, journal->j_commit_request);
 
 	if (journal->j_commit_sequence != journal->j_commit_request) {
-		jbd_debug(1, "OK, requests differ\n");
+		jbd2_debug(1, "OK, requests differ\n");
 		write_unlock(&journal->j_state_lock);
 		del_timer_sync(&journal->j_commit_timer);
 		jbd2_journal_commit_transaction(journal);
@@ -222,7 +219,7 @@ loop:
 		 * good idea, because that depends on threads that may
 		 * be already stopped.
 		 */
-		jbd_debug(1, "Now suspending kjournald2\n");
+		jbd2_debug(1, "Now suspending kjournald2\n");
 		write_unlock(&journal->j_state_lock);
 		try_to_freeze();
 		write_lock(&journal->j_state_lock);
@@ -252,7 +249,7 @@ loop:
 		finish_wait(&journal->j_wait_commit, &wait);
 	}
 
-	jbd_debug(1, "kjournald2 wakes\n");
+	jbd2_debug(1, "kjournald2 wakes\n");
 
 	/*
 	 * Were we woken up by a commit wakeup event?
@@ -260,7 +257,7 @@ loop:
 	transaction = journal->j_running_transaction;
 	if (transaction && time_after_eq(jiffies, transaction->t_expires)) {
 		journal->j_commit_request = transaction->t_tid;
-		jbd_debug(1, "woke because of timeout\n");
+		jbd2_debug(1, "woke because of timeout\n");
 	}
 	goto loop;
 
@@ -268,7 +265,7 @@ end_loop:
 	del_timer_sync(&journal->j_commit_timer);
 	journal->j_task = NULL;
 	wake_up(&journal->j_wait_done_commit);
-	jbd_debug(1, "Journal thread exiting.\n");
+	jbd2_debug(1, "Journal thread exiting.\n");
 	write_unlock(&journal->j_state_lock);
 	return 0;
 }
@@ -481,7 +478,7 @@ repeat:
  * Called with j_state_lock locked for writing.
  * Returns true if a transaction commit was started.
  */
-int __jbd2_log_start_commit(journal_t *journal, tid_t target)
+static int __jbd2_log_start_commit(journal_t *journal, tid_t target)
 {
 	/* Return if the txn has already requested to be committed */
 	if (journal->j_commit_request == target)
@@ -500,7 +497,7 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t target)
 		 */
 
 		journal->j_commit_request = target;
-		jbd_debug(1, "JBD2: requesting commit %u/%u\n",
+		jbd2_debug(1, "JBD2: requesting commit %u/%u\n",
 			  journal->j_commit_request,
 			  journal->j_commit_sequence);
 		journal->j_running_transaction->t_requested = jiffies;
@@ -705,7 +702,7 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
 	}
 #endif
 	while (tid_gt(tid, journal->j_commit_sequence)) {
-		jbd_debug(1, "JBD2: want %u, j_commit_sequence=%u\n",
+		jbd2_debug(1, "JBD2: want %u, j_commit_sequence=%u\n",
 				  tid, journal->j_commit_sequence);
 		read_unlock(&journal->j_state_lock);
 		wake_up(&journal->j_wait_commit);
@@ -1117,7 +1114,7 @@ int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
 		freed += journal->j_last - journal->j_first;
 
 	trace_jbd2_update_log_tail(journal, tid, block, freed);
-	jbd_debug(1,
+	jbd2_debug(1,
 		  "Cleaning journal tail from %u to %u (offset %lu), "
 		  "freeing %lu\n",
 		  journal->j_tail_sequence, tid, block, freed);
@@ -1497,7 +1494,7 @@ journal_t *jbd2_journal_init_inode(struct inode *inode)
 		return NULL;
 	}
 
-	jbd_debug(1, "JBD2: inode %s/%ld, size %lld, bits %d, blksize %ld\n",
+	jbd2_debug(1, "JBD2: inode %s/%ld, size %lld, bits %d, blksize %ld\n",
 		  inode->i_sb->s_id, inode->i_ino, (long long) inode->i_size,
 		  inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize);
 
@@ -1577,7 +1574,7 @@ static int journal_reset(journal_t *journal)
 	 * attempting a write to a potential-readonly device.
 	 */
 	if (sb->s_start == 0) {
-		jbd_debug(1, "JBD2: Skipping superblock update on recovered sb "
+		jbd2_debug(1, "JBD2: Skipping superblock update on recovered sb "
 			"(start %ld, seq %u, errno %d)\n",
 			journal->j_tail, journal->j_tail_sequence,
 			journal->j_errno);
@@ -1681,7 +1678,7 @@ int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
 	}
 
 	BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
-	jbd_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n",
+	jbd2_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n",
 		  tail_block, tail_tid);
 
 	lock_buffer(journal->j_sb_buffer);
@@ -1722,7 +1719,7 @@ static void jbd2_mark_journal_empty(journal_t *journal, blk_opf_t write_flags)
 		return;
 	}
 
-	jbd_debug(1, "JBD2: Marking journal as empty (seq %u)\n",
+	jbd2_debug(1, "JBD2: Marking journal as empty (seq %u)\n",
 		  journal->j_tail_sequence);
 
 	sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
@@ -1865,7 +1862,7 @@ void jbd2_journal_update_sb_errno(journal_t *journal)
 	errcode = journal->j_errno;
 	if (errcode == -ESHUTDOWN)
 		errcode = 0;
-	jbd_debug(1, "JBD2: updating superblock error (errno %d)\n", errcode);
+	jbd2_debug(1, "JBD2: updating superblock error (errno %d)\n", errcode);
 	sb->s_errno    = cpu_to_be32(errcode);
 
 	jbd2_write_superblock(journal, REQ_SYNC | REQ_FUA);
@@ -2337,7 +2334,7 @@ int jbd2_journal_set_features(journal_t *journal, unsigned long compat,
 	    compat & JBD2_FEATURE_COMPAT_CHECKSUM)
 		compat &= ~JBD2_FEATURE_COMPAT_CHECKSUM;
 
-	jbd_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n",
+	jbd2_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n",
 		  compat, ro, incompat);
 
 	sb = journal->j_superblock;
@@ -2406,7 +2403,7 @@ void jbd2_journal_clear_features(journal_t *journal, unsigned long compat,
 {
 	journal_superblock_t *sb;
 
-	jbd_debug(1, "Clear features 0x%lx/0x%lx/0x%lx\n",
+	jbd2_debug(1, "Clear features 0x%lx/0x%lx/0x%lx\n",
 		  compat, ro, incompat);
 
 	sb = journal->j_superblock;
@@ -2863,7 +2860,7 @@ static struct journal_head *journal_alloc_journal_head(void)
 #endif
 	ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS);
 	if (!ret) {
-		jbd_debug(1, "out of memory for journal_head\n");
+		jbd2_debug(1, "out of memory for journal_head\n");
 		pr_notice_ratelimited("ENOMEM in %s, retrying.\n", __func__);
 		ret = kmem_cache_zalloc(jbd2_journal_head_cache,
 				GFP_NOFS | __GFP_NOFAIL);
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index e699d6ab2c0e..f548479615c6 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -245,11 +245,11 @@ static int fc_do_one_pass(journal_t *journal,
 		return 0;
 
 	while (next_fc_block <= journal->j_fc_last) {
-		jbd_debug(3, "Fast commit replay: next block %ld\n",
+		jbd2_debug(3, "Fast commit replay: next block %ld\n",
 			  next_fc_block);
 		err = jread(&bh, journal, next_fc_block);
 		if (err) {
-			jbd_debug(3, "Fast commit replay: read error\n");
+			jbd2_debug(3, "Fast commit replay: read error\n");
 			break;
 		}
 
@@ -263,7 +263,7 @@ static int fc_do_one_pass(journal_t *journal,
 	}
 
 	if (err)
-		jbd_debug(3, "Fast commit replay failed, err = %d\n", err);
+		jbd2_debug(3, "Fast commit replay failed, err = %d\n", err);
 
 	return err;
 }
@@ -297,7 +297,7 @@ int jbd2_journal_recover(journal_t *journal)
 	 */
 
 	if (!sb->s_start) {
-		jbd_debug(1, "No recovery required, last transaction %d\n",
+		jbd2_debug(1, "No recovery required, last transaction %d\n",
 			  be32_to_cpu(sb->s_sequence));
 		journal->j_transaction_sequence = be32_to_cpu(sb->s_sequence) + 1;
 		return 0;
@@ -309,10 +309,10 @@ int jbd2_journal_recover(journal_t *journal)
 	if (!err)
 		err = do_one_pass(journal, &info, PASS_REPLAY);
 
-	jbd_debug(1, "JBD2: recovery, exit status %d, "
+	jbd2_debug(1, "JBD2: recovery, exit status %d, "
 		  "recovered transactions %u to %u\n",
 		  err, info.start_transaction, info.end_transaction);
-	jbd_debug(1, "JBD2: Replayed %d and revoked %d/%d blocks\n",
+	jbd2_debug(1, "JBD2: Replayed %d and revoked %d/%d blocks\n",
 		  info.nr_replays, info.nr_revoke_hits, info.nr_revokes);
 
 	/* Restart the log at the next transaction ID, thus invalidating
@@ -362,7 +362,7 @@ int jbd2_journal_skip_recovery(journal_t *journal)
 #ifdef CONFIG_JBD2_DEBUG
 		int dropped = info.end_transaction - 
 			be32_to_cpu(journal->j_superblock->s_sequence);
-		jbd_debug(1,
+		jbd2_debug(1,
 			  "JBD2: ignoring %d transaction%s from the journal.\n",
 			  dropped, (dropped == 1) ? "" : "s");
 #endif
@@ -484,7 +484,7 @@ static int do_one_pass(journal_t *journal,
 	if (pass == PASS_SCAN)
 		info->start_transaction = first_commit_ID;
 
-	jbd_debug(1, "Starting recovery pass %d\n", pass);
+	jbd2_debug(1, "Starting recovery pass %d\n", pass);
 
 	/*
 	 * Now we walk through the log, transaction by transaction,
@@ -510,7 +510,7 @@ static int do_one_pass(journal_t *journal,
 			if (tid_geq(next_commit_ID, info->end_transaction))
 				break;
 
-		jbd_debug(2, "Scanning for sequence ID %u at %lu/%lu\n",
+		jbd2_debug(2, "Scanning for sequence ID %u at %lu/%lu\n",
 			  next_commit_ID, next_log_block,
 			  jbd2_has_feature_fast_commit(journal) ?
 			  journal->j_fc_last : journal->j_last);
@@ -519,7 +519,7 @@ static int do_one_pass(journal_t *journal,
 		 * either the next descriptor block or the final commit
 		 * record. */
 
-		jbd_debug(3, "JBD2: checking block %ld\n", next_log_block);
+		jbd2_debug(3, "JBD2: checking block %ld\n", next_log_block);
 		err = jread(&bh, journal, next_log_block);
 		if (err)
 			goto failed;
@@ -542,7 +542,7 @@ static int do_one_pass(journal_t *journal,
 
 		blocktype = be32_to_cpu(tmp->h_blocktype);
 		sequence = be32_to_cpu(tmp->h_sequence);
-		jbd_debug(3, "Found magic %d, sequence %d\n",
+		jbd2_debug(3, "Found magic %d, sequence %d\n",
 			  blocktype, sequence);
 
 		if (sequence != next_commit_ID) {
@@ -575,7 +575,7 @@ static int do_one_pass(journal_t *journal,
 					goto failed;
 				}
 				need_check_commit_time = true;
-				jbd_debug(1,
+				jbd2_debug(1,
 					"invalid descriptor block found in %lu\n",
 					next_log_block);
 			}
@@ -758,7 +758,7 @@ static int do_one_pass(journal_t *journal,
 				 * It likely does not belong to same journal,
 				 * just end this recovery with success.
 				 */
-				jbd_debug(1, "JBD2: Invalid checksum ignored in transaction %u, likely stale data\n",
+				jbd2_debug(1, "JBD2: Invalid checksum ignored in transaction %u, likely stale data\n",
 					  next_commit_ID);
 				brelse(bh);
 				goto done;
@@ -826,7 +826,7 @@ static int do_one_pass(journal_t *journal,
 			if (pass == PASS_SCAN &&
 			    !jbd2_descriptor_block_csum_verify(journal,
 							       bh->b_data)) {
-				jbd_debug(1, "JBD2: invalid revoke block found in %lu\n",
+				jbd2_debug(1, "JBD2: invalid revoke block found in %lu\n",
 					  next_log_block);
 				need_check_commit_time = true;
 			}
@@ -845,7 +845,7 @@ static int do_one_pass(journal_t *journal,
 			continue;
 
 		default:
-			jbd_debug(3, "Unrecognised magic %d, end of scan.\n",
+			jbd2_debug(3, "Unrecognised magic %d, end of scan.\n",
 				  blocktype);
 			brelse(bh);
 			goto done;
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index fa608788b93d..4556e4689024 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -398,7 +398,7 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr,
 	}
 	handle->h_revoke_credits--;
 
-	jbd_debug(2, "insert revoke for block %llu, bh_in=%p\n",blocknr, bh_in);
+	jbd2_debug(2, "insert revoke for block %llu, bh_in=%p\n",blocknr, bh_in);
 	err = insert_revoke_hash(journal, blocknr,
 				handle->h_transaction->t_tid);
 	BUFFER_TRACE(bh_in, "exit");
@@ -428,7 +428,7 @@ int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
 	int did_revoke = 0;	/* akpm: debug */
 	struct buffer_head *bh = jh2bh(jh);
 
-	jbd_debug(4, "journal_head %p, cancelling revoke\n", jh);
+	jbd2_debug(4, "journal_head %p, cancelling revoke\n", jh);
 
 	/* Is the existing Revoke bit valid?  If so, we trust it, and
 	 * only perform the full cancel if the revoke bit is set.  If
@@ -444,7 +444,7 @@ int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
 	if (need_cancel) {
 		record = find_revoke_record(journal, bh->b_blocknr);
 		if (record) {
-			jbd_debug(4, "cancelled existing revoke on "
+			jbd2_debug(4, "cancelled existing revoke on "
 				  "blocknr %llu\n", (unsigned long long)bh->b_blocknr);
 			spin_lock(&journal->j_revoke_lock);
 			list_del(&record->hash);
@@ -560,7 +560,7 @@ void jbd2_journal_write_revoke_records(transaction_t *transaction,
 	}
 	if (descriptor)
 		flush_descriptor(journal, descriptor, offset);
-	jbd_debug(1, "Wrote %d revoke records\n", count);
+	jbd2_debug(1, "Wrote %d revoke records\n", count);
 }
 
 /*
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index e9c308ae475f..e1be93ccd81c 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -373,7 +373,7 @@ alloc_transaction:
 			return -ENOMEM;
 	}
 
-	jbd_debug(3, "New handle %p going live.\n", handle);
+	jbd2_debug(3, "New handle %p going live.\n", handle);
 
 	/*
 	 * We need to hold j_state_lock until t_updates has been incremented,
@@ -453,7 +453,7 @@ repeat:
 	handle->h_start_jiffies = jiffies;
 	atomic_inc(&transaction->t_updates);
 	atomic_inc(&transaction->t_handle_count);
-	jbd_debug(4, "Handle %p given %d credits (total %d, free %lu)\n",
+	jbd2_debug(4, "Handle %p given %d credits (total %d, free %lu)\n",
 		  handle, blocks,
 		  atomic_read(&transaction->t_outstanding_credits),
 		  jbd2_log_space_left(journal));
@@ -674,7 +674,7 @@ int jbd2_journal_extend(handle_t *handle, int nblocks, int revoke_records)
 
 	/* Don't extend a locked-down transaction! */
 	if (transaction->t_state != T_RUNNING) {
-		jbd_debug(3, "denied handle %p %d blocks: "
+		jbd2_debug(3, "denied handle %p %d blocks: "
 			  "transaction not running\n", handle, nblocks);
 		goto error_out;
 	}
@@ -689,7 +689,7 @@ int jbd2_journal_extend(handle_t *handle, int nblocks, int revoke_records)
 				   &transaction->t_outstanding_credits);
 
 	if (wanted > journal->j_max_transaction_buffers) {
-		jbd_debug(3, "denied handle %p %d blocks: "
+		jbd2_debug(3, "denied handle %p %d blocks: "
 			  "transaction too large\n", handle, nblocks);
 		atomic_sub(nblocks, &transaction->t_outstanding_credits);
 		goto error_out;
@@ -707,7 +707,7 @@ int jbd2_journal_extend(handle_t *handle, int nblocks, int revoke_records)
 	handle->h_revoke_credits_requested += revoke_records;
 	result = 0;
 
-	jbd_debug(3, "extended handle %p by %d\n", handle, nblocks);
+	jbd2_debug(3, "extended handle %p by %d\n", handle, nblocks);
 error_out:
 	read_unlock(&journal->j_state_lock);
 	return result;
@@ -795,7 +795,7 @@ int jbd2__journal_restart(handle_t *handle, int nblocks, int revoke_records,
 	 * First unlink the handle from its current transaction, and start the
 	 * commit on that.
 	 */
-	jbd_debug(2, "restarting handle %p\n", handle);
+	jbd2_debug(2, "restarting handle %p\n", handle);
 	stop_this_handle(handle);
 	handle->h_transaction = NULL;
 
@@ -979,7 +979,7 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
 
 	journal = transaction->t_journal;
 
-	jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy);
+	jbd2_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy);
 
 	JBUFFER_TRACE(jh, "entry");
 repeat:
@@ -1271,7 +1271,7 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
 	struct journal_head *jh = jbd2_journal_add_journal_head(bh);
 	int err;
 
-	jbd_debug(5, "journal_head %p\n", jh);
+	jbd2_debug(5, "journal_head %p\n", jh);
 	err = -EROFS;
 	if (is_handle_aborted(handle))
 		goto out;
@@ -1486,8 +1486,6 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
 	struct journal_head *jh;
 	int ret = 0;
 
-	if (is_handle_aborted(handle))
-		return -EROFS;
 	if (!buffer_jbd(bh))
 		return -EUCLEAN;
 
@@ -1496,7 +1494,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
 	 * of the running transaction.
 	 */
 	jh = bh2jh(bh);
-	jbd_debug(5, "journal_head %p\n", jh);
+	jbd2_debug(5, "journal_head %p\n", jh);
 	JBUFFER_TRACE(jh, "entry");
 
 	/*
@@ -1534,6 +1532,18 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
 	journal = transaction->t_journal;
 	spin_lock(&jh->b_state_lock);
 
+	if (is_handle_aborted(handle)) {
+		/*
+		 * Check journal aborting with @jh->b_state_lock locked,
+		 * since 'jh->b_transaction' could be replaced with
+		 * 'jh->b_next_transaction' during old transaction
+		 * committing if journal aborted, which may fail
+		 * assertion on 'jh->b_frozen_data == NULL'.
+		 */
+		ret = -EROFS;
+		goto out_unlock_bh;
+	}
+
 	if (jh->b_modified == 0) {
 		/*
 		 * This buffer's got modified and becoming part
@@ -1818,7 +1828,7 @@ int jbd2_journal_stop(handle_t *handle)
 	pid_t pid;
 
 	if (--handle->h_ref > 0) {
-		jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
+		jbd2_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
 						 handle->h_ref);
 		if (is_handle_aborted(handle))
 			return -EIO;
@@ -1838,7 +1848,7 @@ int jbd2_journal_stop(handle_t *handle)
 	if (is_handle_aborted(handle))
 		err = -EIO;
 
-	jbd_debug(4, "Handle %p going down\n", handle);
+	jbd2_debug(4, "Handle %p going down\n", handle);
 	trace_jbd2_handle_stats(journal->j_fs_dev->bd_dev,
 				tid, handle->h_type, handle->h_line_no,
 				jiffies - handle->h_start_jiffies,
@@ -1916,7 +1926,7 @@ int jbd2_journal_stop(handle_t *handle)
 		 * completes the commit thread, it just doesn't write
 		 * anything to disk. */
 
-		jbd_debug(2, "transaction too old, requesting commit for "
+		jbd2_debug(2, "transaction too old, requesting commit for "
 					"handle %p\n", handle);
 		/* This is non-blocking */
 		jbd2_log_start_commit(journal, tid);
@@ -2662,7 +2672,7 @@ static int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode,
 		return -EROFS;
 	journal = transaction->t_journal;
 
-	jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino,
+	jbd2_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino,
 			transaction->t_tid);
 
 	spin_lock(&journal->j_list_lock);
diff --git a/fs/mbcache.c b/fs/mbcache.c
index 97c54d3a2227..96f1d49d30a5 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -11,7 +11,7 @@
 /*
  * Mbcache is a simple key-value store. Keys need not be unique, however
  * key-value pairs are expected to be unique (we use this fact in
- * mb_cache_entry_delete()).
+ * mb_cache_entry_delete_or_get()).
  *
  * Ext2 and ext4 use this cache for deduplication of extended attribute blocks.
  * Ext4 also uses it for deduplication of xattr values stored in inodes.
@@ -90,7 +90,7 @@ int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
 		return -ENOMEM;
 
 	INIT_LIST_HEAD(&entry->e_list);
-	/* One ref for hash, one ref returned */
+	/* Initial hash reference */
 	atomic_set(&entry->e_refcnt, 1);
 	entry->e_key = key;
 	entry->e_value = value;
@@ -106,25 +106,45 @@ int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
 		}
 	}
 	hlist_bl_add_head(&entry->e_hash_list, head);
-	hlist_bl_unlock(head);
-
+	/*
+	 * Add entry to LRU list before it can be found by
+	 * mb_cache_entry_delete() to avoid races
+	 */
 	spin_lock(&cache->c_list_lock);
 	list_add_tail(&entry->e_list, &cache->c_list);
-	/* Grab ref for LRU list */
-	atomic_inc(&entry->e_refcnt);
 	cache->c_entry_count++;
 	spin_unlock(&cache->c_list_lock);
+	hlist_bl_unlock(head);
 
 	return 0;
 }
 EXPORT_SYMBOL(mb_cache_entry_create);
 
-void __mb_cache_entry_free(struct mb_cache_entry *entry)
+void __mb_cache_entry_free(struct mb_cache *cache, struct mb_cache_entry *entry)
 {
+	struct hlist_bl_head *head;
+
+	head = mb_cache_entry_head(cache, entry->e_key);
+	hlist_bl_lock(head);
+	hlist_bl_del(&entry->e_hash_list);
+	hlist_bl_unlock(head);
 	kmem_cache_free(mb_entry_cache, entry);
 }
 EXPORT_SYMBOL(__mb_cache_entry_free);
 
+/*
+ * mb_cache_entry_wait_unused - wait to be the last user of the entry
+ *
+ * @entry - entry to work on
+ *
+ * Wait to be the last user of the entry.
+ */
+void mb_cache_entry_wait_unused(struct mb_cache_entry *entry)
+{
+	wait_var_event(&entry->e_refcnt, atomic_read(&entry->e_refcnt) <= 2);
+}
+EXPORT_SYMBOL(mb_cache_entry_wait_unused);
+
 static struct mb_cache_entry *__entry_find(struct mb_cache *cache,
 					   struct mb_cache_entry *entry,
 					   u32 key)
@@ -142,10 +162,9 @@ static struct mb_cache_entry *__entry_find(struct mb_cache *cache,
 	while (node) {
 		entry = hlist_bl_entry(node, struct mb_cache_entry,
 				       e_hash_list);
-		if (entry->e_key == key && entry->e_reusable) {
-			atomic_inc(&entry->e_refcnt);
+		if (entry->e_key == key && entry->e_reusable &&
+		    atomic_inc_not_zero(&entry->e_refcnt))
 			goto out;
-		}
 		node = node->next;
 	}
 	entry = NULL;
@@ -205,10 +224,9 @@ struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
 	head = mb_cache_entry_head(cache, key);
 	hlist_bl_lock(head);
 	hlist_bl_for_each_entry(entry, node, head, e_hash_list) {
-		if (entry->e_key == key && entry->e_value == value) {
-			atomic_inc(&entry->e_refcnt);
+		if (entry->e_key == key && entry->e_value == value &&
+		    atomic_inc_not_zero(&entry->e_refcnt))
 			goto out;
-		}
 	}
 	entry = NULL;
 out:
@@ -217,42 +235,42 @@ out:
 }
 EXPORT_SYMBOL(mb_cache_entry_get);
 
-/* mb_cache_entry_delete - remove a cache entry
+/* mb_cache_entry_delete_or_get - remove a cache entry if it has no users
  * @cache - cache we work with
  * @key - key
  * @value - value
  *
- * Remove entry from cache @cache with key @key and value @value.
+ * Remove entry from cache @cache with key @key and value @value. The removal
+ * happens only if the entry is unused. The function returns NULL in case the
+ * entry was successfully removed or there's no entry in cache. Otherwise the
+ * function grabs reference of the entry that we failed to delete because it
+ * still has users and return it.
  */
-void mb_cache_entry_delete(struct mb_cache *cache, u32 key, u64 value)
+struct mb_cache_entry *mb_cache_entry_delete_or_get(struct mb_cache *cache,
+						    u32 key, u64 value)
 {
-	struct hlist_bl_node *node;
-	struct hlist_bl_head *head;
 	struct mb_cache_entry *entry;
 
-	head = mb_cache_entry_head(cache, key);
-	hlist_bl_lock(head);
-	hlist_bl_for_each_entry(entry, node, head, e_hash_list) {
-		if (entry->e_key == key && entry->e_value == value) {
-			/* We keep hash list reference to keep entry alive */
-			hlist_bl_del_init(&entry->e_hash_list);
-			hlist_bl_unlock(head);
-			spin_lock(&cache->c_list_lock);
-			if (!list_empty(&entry->e_list)) {
-				list_del_init(&entry->e_list);
-				if (!WARN_ONCE(cache->c_entry_count == 0,
-		"mbcache: attempt to decrement c_entry_count past zero"))
-					cache->c_entry_count--;
-				atomic_dec(&entry->e_refcnt);
-			}
-			spin_unlock(&cache->c_list_lock);
-			mb_cache_entry_put(cache, entry);
-			return;
-		}
-	}
-	hlist_bl_unlock(head);
+	entry = mb_cache_entry_get(cache, key, value);
+	if (!entry)
+		return NULL;
+
+	/*
+	 * Drop the ref we got from mb_cache_entry_get() and the initial hash
+	 * ref if we are the last user
+	 */
+	if (atomic_cmpxchg(&entry->e_refcnt, 2, 0) != 2)
+		return entry;
+
+	spin_lock(&cache->c_list_lock);
+	if (!list_empty(&entry->e_list))
+		list_del_init(&entry->e_list);
+	cache->c_entry_count--;
+	spin_unlock(&cache->c_list_lock);
+	__mb_cache_entry_free(cache, entry);
+	return NULL;
 }
-EXPORT_SYMBOL(mb_cache_entry_delete);
+EXPORT_SYMBOL(mb_cache_entry_delete_or_get);
 
 /* mb_cache_entry_touch - cache entry got used
  * @cache - cache the entry belongs to
@@ -281,34 +299,24 @@ static unsigned long mb_cache_shrink(struct mb_cache *cache,
 				     unsigned long nr_to_scan)
 {
 	struct mb_cache_entry *entry;
-	struct hlist_bl_head *head;
 	unsigned long shrunk = 0;
 
 	spin_lock(&cache->c_list_lock);
 	while (nr_to_scan-- && !list_empty(&cache->c_list)) {
 		entry = list_first_entry(&cache->c_list,
 					 struct mb_cache_entry, e_list);
-		if (entry->e_referenced) {
+		/* Drop initial hash reference if there is no user */
+		if (entry->e_referenced ||
+		    atomic_cmpxchg(&entry->e_refcnt, 1, 0) != 1) {
 			entry->e_referenced = 0;
 			list_move_tail(&entry->e_list, &cache->c_list);
 			continue;
 		}
 		list_del_init(&entry->e_list);
 		cache->c_entry_count--;
-		/*
-		 * We keep LRU list reference so that entry doesn't go away
-		 * from under us.
-		 */
 		spin_unlock(&cache->c_list_lock);
-		head = mb_cache_entry_head(cache, entry->e_key);
-		hlist_bl_lock(head);
-		if (!hlist_bl_unhashed(&entry->e_hash_list)) {
-			hlist_bl_del_init(&entry->e_hash_list);
-			atomic_dec(&entry->e_refcnt);
-		}
-		hlist_bl_unlock(head);
-		if (mb_cache_entry_put(cache, entry))
-			shrunk++;
+		__mb_cache_entry_free(cache, entry);
+		shrunk++;
 		cond_resched();
 		spin_lock(&cache->c_list_lock);
 	}
@@ -400,11 +408,6 @@ void mb_cache_destroy(struct mb_cache *cache)
 	 * point.
 	 */
 	list_for_each_entry_safe(entry, next, &cache->c_list, e_list) {
-		if (!hlist_bl_unhashed(&entry->e_hash_list)) {
-			hlist_bl_del_init(&entry->e_hash_list);
-			atomic_dec(&entry->e_refcnt);
-		} else
-			WARN_ON(1);
 		list_del(&entry->e_list);
 		WARN_ON(atomic_read(&entry->e_refcnt) != 1);
 		mb_cache_entry_put(cache, entry);
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index b056cfc6398e..1131dd01e4fe 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -106,6 +106,7 @@ xfs-y				+= xfs_log.o \
 				   xfs_icreate_item.o \
 				   xfs_inode_item.o \
 				   xfs_inode_item_recover.o \
+				   xfs_iunlink_item.o \
 				   xfs_refcount_item.o \
 				   xfs_rmap_item.o \
 				   xfs_log_recover.o \
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index 3e920cf1b454..bb0c700afe3c 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -120,18 +120,18 @@ xfs_initialize_perag_data(
 
 	for (index = 0; index < agcount; index++) {
 		/*
-		 * read the agf, then the agi. This gets us
-		 * all the information we need and populates the
-		 * per-ag structures for us.
+		 * Read the AGF and AGI buffers to populate the per-ag
+		 * structures for us.
 		 */
-		error = xfs_alloc_pagf_init(mp, NULL, index, 0);
-		if (error)
-			return error;
-
-		error = xfs_ialloc_pagi_init(mp, NULL, index);
-		if (error)
-			return error;
 		pag = xfs_perag_get(mp, index);
+		error = xfs_alloc_read_agf(pag, NULL, 0, NULL);
+		if (!error)
+			error = xfs_ialloc_read_agi(pag, NULL, NULL);
+		if (error) {
+			xfs_perag_put(pag);
+			return error;
+		}
+
 		ifree += pag->pagi_freecount;
 		ialloc += pag->pagi_count;
 		bfree += pag->pagf_freeblks;
@@ -194,17 +194,76 @@ xfs_free_perag(
 		XFS_IS_CORRUPT(pag->pag_mount, atomic_read(&pag->pag_ref) != 0);
 
 		cancel_delayed_work_sync(&pag->pag_blockgc_work);
-		xfs_iunlink_destroy(pag);
 		xfs_buf_hash_destroy(pag);
 
 		call_rcu(&pag->rcu_head, __xfs_free_perag);
 	}
 }
 
+/* Find the size of the AG, in blocks. */
+static xfs_agblock_t
+__xfs_ag_block_count(
+	struct xfs_mount	*mp,
+	xfs_agnumber_t		agno,
+	xfs_agnumber_t		agcount,
+	xfs_rfsblock_t		dblocks)
+{
+	ASSERT(agno < agcount);
+
+	if (agno < agcount - 1)
+		return mp->m_sb.sb_agblocks;
+	return dblocks - (agno * mp->m_sb.sb_agblocks);
+}
+
+xfs_agblock_t
+xfs_ag_block_count(
+	struct xfs_mount	*mp,
+	xfs_agnumber_t		agno)
+{
+	return __xfs_ag_block_count(mp, agno, mp->m_sb.sb_agcount,
+			mp->m_sb.sb_dblocks);
+}
+
+/* Calculate the first and last possible inode number in an AG. */
+static void
+__xfs_agino_range(
+	struct xfs_mount	*mp,
+	xfs_agblock_t		eoag,
+	xfs_agino_t		*first,
+	xfs_agino_t		*last)
+{
+	xfs_agblock_t		bno;
+
+	/*
+	 * Calculate the first inode, which will be in the first
+	 * cluster-aligned block after the AGFL.
+	 */
+	bno = round_up(XFS_AGFL_BLOCK(mp) + 1, M_IGEO(mp)->cluster_align);
+	*first = XFS_AGB_TO_AGINO(mp, bno);
+
+	/*
+	 * Calculate the last inode, which will be at the end of the
+	 * last (aligned) cluster that can be allocated in the AG.
+	 */
+	bno = round_down(eoag, M_IGEO(mp)->cluster_align);
+	*last = XFS_AGB_TO_AGINO(mp, bno) - 1;
+}
+
+void
+xfs_agino_range(
+	struct xfs_mount	*mp,
+	xfs_agnumber_t		agno,
+	xfs_agino_t		*first,
+	xfs_agino_t		*last)
+{
+	return __xfs_agino_range(mp, xfs_ag_block_count(mp, agno), first, last);
+}
+
 int
 xfs_initialize_perag(
 	struct xfs_mount	*mp,
 	xfs_agnumber_t		agcount,
+	xfs_rfsblock_t		dblocks,
 	xfs_agnumber_t		*maxagi)
 {
 	struct xfs_perag	*pag;
@@ -263,13 +322,18 @@ xfs_initialize_perag(
 		if (error)
 			goto out_remove_pag;
 
-		error = xfs_iunlink_init(pag);
-		if (error)
-			goto out_hash_destroy;
-
 		/* first new pag is fully initialized */
 		if (first_initialised == NULLAGNUMBER)
 			first_initialised = index;
+
+		/*
+		 * Pre-calculated geometry
+		 */
+		pag->block_count = __xfs_ag_block_count(mp, index, agcount,
+				dblocks);
+		pag->min_block = XFS_AGFL_BLOCK(mp);
+		__xfs_agino_range(mp, pag->block_count, &pag->agino_min,
+				&pag->agino_max);
 	}
 
 	index = xfs_set_inode_alloc(mp, agcount);
@@ -280,8 +344,6 @@ xfs_initialize_perag(
 	mp->m_ag_prealloc_blocks = xfs_prealloc_blocks(mp);
 	return 0;
 
-out_hash_destroy:
-	xfs_buf_hash_destroy(pag);
 out_remove_pag:
 	radix_tree_delete(&mp->m_perag_tree, index);
 out_free_pag:
@@ -293,7 +355,6 @@ out_unwind_new_pags:
 		if (!pag)
 			break;
 		xfs_buf_hash_destroy(pag);
-		xfs_iunlink_destroy(pag);
 		kmem_free(pag);
 	}
 	return error;
@@ -321,12 +382,6 @@ xfs_get_aghdr_buf(
 	return 0;
 }
 
-static inline bool is_log_ag(struct xfs_mount *mp, struct aghdr_init_data *id)
-{
-	return mp->m_sb.sb_logstart > 0 &&
-	       id->agno == XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart);
-}
-
 /*
  * Generic btree root block init function
  */
@@ -352,7 +407,7 @@ xfs_freesp_init_recs(
 	arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
 	arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks);
 
-	if (is_log_ag(mp, id)) {
+	if (xfs_ag_contains_log(mp, id->agno)) {
 		struct xfs_alloc_rec	*nrec;
 		xfs_agblock_t		start = XFS_FSB_TO_AGBNO(mp,
 							mp->m_sb.sb_logstart);
@@ -479,7 +534,7 @@ xfs_rmaproot_init(
 	}
 
 	/* account for the log space */
-	if (is_log_ag(mp, id)) {
+	if (xfs_ag_contains_log(mp, id->agno)) {
 		rrec = XFS_RMAP_REC_ADDR(block,
 				be16_to_cpu(block->bb_numrecs) + 1);
 		rrec->rm_startblock = cpu_to_be32(
@@ -550,7 +605,7 @@ xfs_agfblock_init(
 		agf->agf_refcount_blocks = cpu_to_be32(1);
 	}
 
-	if (is_log_ag(mp, id)) {
+	if (xfs_ag_contains_log(mp, id->agno)) {
 		int64_t	logblocks = mp->m_sb.sb_logblocks;
 
 		be32_add_cpu(&agf->agf_freeblks, -logblocks);
@@ -761,11 +816,11 @@ xfs_ag_init_headers(
 
 int
 xfs_ag_shrink_space(
-	struct xfs_mount	*mp,
+	struct xfs_perag	*pag,
 	struct xfs_trans	**tpp,
-	xfs_agnumber_t		agno,
 	xfs_extlen_t		delta)
 {
+	struct xfs_mount	*mp = pag->pag_mount;
 	struct xfs_alloc_arg	args = {
 		.tp	= *tpp,
 		.mp	= mp,
@@ -782,14 +837,14 @@ xfs_ag_shrink_space(
 	xfs_agblock_t		aglen;
 	int			error, err2;
 
-	ASSERT(agno == mp->m_sb.sb_agcount - 1);
-	error = xfs_ialloc_read_agi(mp, *tpp, agno, &agibp);
+	ASSERT(pag->pag_agno == mp->m_sb.sb_agcount - 1);
+	error = xfs_ialloc_read_agi(pag, *tpp, &agibp);
 	if (error)
 		return error;
 
 	agi = agibp->b_addr;
 
-	error = xfs_alloc_read_agf(mp, *tpp, agno, 0, &agfbp);
+	error = xfs_alloc_read_agf(pag, *tpp, 0, &agfbp);
 	if (error)
 		return error;
 
@@ -801,13 +856,14 @@ xfs_ag_shrink_space(
 	if (delta >= aglen)
 		return -EINVAL;
 
-	args.fsbno = XFS_AGB_TO_FSB(mp, agno, aglen - delta);
+	args.fsbno = XFS_AGB_TO_FSB(mp, pag->pag_agno, aglen - delta);
 
 	/*
 	 * Make sure that the last inode cluster cannot overlap with the new
 	 * end of the AG, even if it's sparse.
 	 */
-	error = xfs_ialloc_check_shrink(*tpp, agno, agibp, aglen - delta);
+	error = xfs_ialloc_check_shrink(*tpp, pag->pag_agno, agibp,
+			aglen - delta);
 	if (error)
 		return error;
 
@@ -815,7 +871,7 @@ xfs_ag_shrink_space(
 	 * Disable perag reservations so it doesn't cause the allocation request
 	 * to fail. We'll reestablish reservation before we return.
 	 */
-	error = xfs_ag_resv_free(agibp->b_pag);
+	error = xfs_ag_resv_free(pag);
 	if (error)
 		return error;
 
@@ -844,7 +900,7 @@ xfs_ag_shrink_space(
 	be32_add_cpu(&agi->agi_length, -delta);
 	be32_add_cpu(&agf->agf_length, -delta);
 
-	err2 = xfs_ag_resv_init(agibp->b_pag, *tpp);
+	err2 = xfs_ag_resv_init(pag, *tpp);
 	if (err2) {
 		be32_add_cpu(&agi->agi_length, delta);
 		be32_add_cpu(&agf->agf_length, delta);
@@ -868,8 +924,9 @@ xfs_ag_shrink_space(
 	xfs_ialloc_log_agi(*tpp, agibp, XFS_AGI_LENGTH);
 	xfs_alloc_log_agf(*tpp, agfbp, XFS_AGF_LENGTH);
 	return 0;
+
 resv_init_out:
-	err2 = xfs_ag_resv_init(agibp->b_pag, *tpp);
+	err2 = xfs_ag_resv_init(pag, *tpp);
 	if (!err2)
 		return error;
 resv_err:
@@ -883,9 +940,8 @@ resv_err:
  */
 int
 xfs_ag_extend_space(
-	struct xfs_mount	*mp,
+	struct xfs_perag	*pag,
 	struct xfs_trans	*tp,
-	struct aghdr_init_data	*id,
 	xfs_extlen_t		len)
 {
 	struct xfs_buf		*bp;
@@ -893,23 +949,20 @@ xfs_ag_extend_space(
 	struct xfs_agf		*agf;
 	int			error;
 
-	/*
-	 * Change the agi length.
-	 */
-	error = xfs_ialloc_read_agi(mp, tp, id->agno, &bp);
+	ASSERT(pag->pag_agno == pag->pag_mount->m_sb.sb_agcount - 1);
+
+	error = xfs_ialloc_read_agi(pag, tp, &bp);
 	if (error)
 		return error;
 
 	agi = bp->b_addr;
 	be32_add_cpu(&agi->agi_length, len);
-	ASSERT(id->agno == mp->m_sb.sb_agcount - 1 ||
-	       be32_to_cpu(agi->agi_length) == mp->m_sb.sb_agblocks);
 	xfs_ialloc_log_agi(tp, bp, XFS_AGI_LENGTH);
 
 	/*
 	 * Change agf length.
 	 */
-	error = xfs_alloc_read_agf(mp, tp, id->agno, 0, &bp);
+	error = xfs_alloc_read_agf(pag, tp, 0, &bp);
 	if (error)
 		return error;
 
@@ -924,49 +977,49 @@ xfs_ag_extend_space(
 	 * XFS_RMAP_OINFO_SKIP_UPDATE is used here to tell the rmap btree that
 	 * this doesn't actually exist in the rmap btree.
 	 */
-	error = xfs_rmap_free(tp, bp, bp->b_pag,
-				be32_to_cpu(agf->agf_length) - len,
+	error = xfs_rmap_free(tp, bp, pag, be32_to_cpu(agf->agf_length) - len,
 				len, &XFS_RMAP_OINFO_SKIP_UPDATE);
 	if (error)
 		return error;
 
-	return  xfs_free_extent(tp, XFS_AGB_TO_FSB(mp, id->agno,
+	error = xfs_free_extent(tp, XFS_AGB_TO_FSB(pag->pag_mount, pag->pag_agno,
 					be32_to_cpu(agf->agf_length) - len),
 				len, &XFS_RMAP_OINFO_SKIP_UPDATE,
 				XFS_AG_RESV_NONE);
+	if (error)
+		return error;
+
+	/* Update perag geometry */
+	pag->block_count = be32_to_cpu(agf->agf_length);
+	__xfs_agino_range(pag->pag_mount, pag->block_count, &pag->agino_min,
+				&pag->agino_max);
+	return 0;
 }
 
 /* Retrieve AG geometry. */
 int
 xfs_ag_get_geometry(
-	struct xfs_mount	*mp,
-	xfs_agnumber_t		agno,
+	struct xfs_perag	*pag,
 	struct xfs_ag_geometry	*ageo)
 {
 	struct xfs_buf		*agi_bp;
 	struct xfs_buf		*agf_bp;
 	struct xfs_agi		*agi;
 	struct xfs_agf		*agf;
-	struct xfs_perag	*pag;
 	unsigned int		freeblks;
 	int			error;
 
-	if (agno >= mp->m_sb.sb_agcount)
-		return -EINVAL;
-
 	/* Lock the AG headers. */
-	error = xfs_ialloc_read_agi(mp, NULL, agno, &agi_bp);
+	error = xfs_ialloc_read_agi(pag, NULL, &agi_bp);
 	if (error)
 		return error;
-	error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agf_bp);
+	error = xfs_alloc_read_agf(pag, NULL, 0, &agf_bp);
 	if (error)
 		goto out_agi;
 
-	pag = agi_bp->b_pag;
-
 	/* Fill out form. */
 	memset(ageo, 0, sizeof(*ageo));
-	ageo->ag_number = agno;
+	ageo->ag_number = pag->pag_agno;
 
 	agi = agi_bp->b_addr;
 	ageo->ag_icount = be32_to_cpu(agi->agi_count);
diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
index e411d51c2589..517a138faa66 100644
--- a/fs/xfs/libxfs/xfs_ag.h
+++ b/fs/xfs/libxfs/xfs_ag.h
@@ -67,6 +67,12 @@ struct xfs_perag {
 	/* for rcu-safe freeing */
 	struct rcu_head	rcu_head;
 
+	/* Precalculated geometry info */
+	xfs_agblock_t		block_count;
+	xfs_agblock_t		min_block;
+	xfs_agino_t		agino_min;
+	xfs_agino_t		agino_max;
+
 #ifdef __KERNEL__
 	/* -- kernel only structures below this line -- */
 
@@ -97,17 +103,11 @@ struct xfs_perag {
 	/* background prealloc block trimming */
 	struct delayed_work	pag_blockgc_work;
 
-	/*
-	 * Unlinked inode information.  This incore information reflects
-	 * data stored in the AGI, so callers must hold the AGI buffer lock
-	 * or have some other means to control concurrency.
-	 */
-	struct rhashtable	pagi_unlinked_hash;
 #endif /* __KERNEL__ */
 };
 
 int xfs_initialize_perag(struct xfs_mount *mp, xfs_agnumber_t agcount,
-			xfs_agnumber_t *maxagi);
+			xfs_rfsblock_t dcount, xfs_agnumber_t *maxagi);
 int xfs_initialize_perag_data(struct xfs_mount *mp, xfs_agnumber_t agno);
 void xfs_free_perag(struct xfs_mount *mp);
 
@@ -116,6 +116,56 @@ struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *mp, xfs_agnumber_t agno,
 		unsigned int tag);
 void xfs_perag_put(struct xfs_perag *pag);
 
+/*
+ * Per-ag geometry infomation and validation
+ */
+xfs_agblock_t xfs_ag_block_count(struct xfs_mount *mp, xfs_agnumber_t agno);
+void xfs_agino_range(struct xfs_mount *mp, xfs_agnumber_t agno,
+		xfs_agino_t *first, xfs_agino_t *last);
+
+static inline bool
+xfs_verify_agbno(struct xfs_perag *pag, xfs_agblock_t agbno)
+{
+	if (agbno >= pag->block_count)
+		return false;
+	if (agbno <= pag->min_block)
+		return false;
+	return true;
+}
+
+/*
+ * Verify that an AG inode number pointer neither points outside the AG
+ * nor points at static metadata.
+ */
+static inline bool
+xfs_verify_agino(struct xfs_perag *pag, xfs_agino_t agino)
+{
+	if (agino < pag->agino_min)
+		return false;
+	if (agino > pag->agino_max)
+		return false;
+	return true;
+}
+
+/*
+ * Verify that an AG inode number pointer neither points outside the AG
+ * nor points at static metadata, or is NULLAGINO.
+ */
+static inline bool
+xfs_verify_agino_or_null(struct xfs_perag *pag, xfs_agino_t agino)
+{
+	if (agino == NULLAGINO)
+		return true;
+	return xfs_verify_agino(pag, agino);
+}
+
+static inline bool
+xfs_ag_contains_log(struct xfs_mount *mp, xfs_agnumber_t agno)
+{
+	return mp->m_sb.sb_logstart > 0 &&
+	       agno == XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart);
+}
+
 /*
  * Perag iteration APIs
  */
@@ -168,11 +218,10 @@ struct aghdr_init_data {
 };
 
 int xfs_ag_init_headers(struct xfs_mount *mp, struct aghdr_init_data *id);
-int xfs_ag_shrink_space(struct xfs_mount *mp, struct xfs_trans **tpp,
-			xfs_agnumber_t agno, xfs_extlen_t delta);
-int xfs_ag_extend_space(struct xfs_mount *mp, struct xfs_trans *tp,
-			struct aghdr_init_data *id, xfs_extlen_t len);
-int xfs_ag_get_geometry(struct xfs_mount *mp, xfs_agnumber_t agno,
-			struct xfs_ag_geometry *ageo);
+int xfs_ag_shrink_space(struct xfs_perag *pag, struct xfs_trans **tpp,
+			xfs_extlen_t delta);
+int xfs_ag_extend_space(struct xfs_perag *pag, struct xfs_trans *tp,
+			xfs_extlen_t len);
+int xfs_ag_get_geometry(struct xfs_perag *pag, struct xfs_ag_geometry *ageo);
 
 #endif /* __LIBXFS_AG_H */
diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c
index fe94058d4e9e..5af123d13a63 100644
--- a/fs/xfs/libxfs/xfs_ag_resv.c
+++ b/fs/xfs/libxfs/xfs_ag_resv.c
@@ -322,7 +322,7 @@ out:
 	 * address.
 	 */
 	if (has_resv) {
-		error2 = xfs_alloc_pagf_init(mp, tp, pag->pag_agno, 0);
+		error2 = xfs_alloc_read_agf(pag, tp, 0, NULL);
 		if (error2)
 			return error2;
 
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index d3f2886fdc08..e2bdf089c0a3 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -84,7 +84,7 @@ xfs_prealloc_blocks(
 /*
  * The number of blocks per AG that we withhold from xfs_mod_fdblocks to
  * guarantee that we can refill the AGFL prior to allocating space in a nearly
- * full AG.  Although the the space described by the free space btrees, the
+ * full AG.  Although the space described by the free space btrees, the
  * blocks used by the freesp btrees themselves, and the blocks owned by the
  * AGFL are counted in the ondisk fdblocks, it's a mistake to let the ondisk
  * free space in the AG drop so low that the free space btrees cannot refill an
@@ -248,7 +248,7 @@ xfs_alloc_get_rec(
 	int			*stat)	/* output: success/failure */
 {
 	struct xfs_mount	*mp = cur->bc_mp;
-	xfs_agnumber_t		agno = cur->bc_ag.pag->pag_agno;
+	struct xfs_perag	*pag = cur->bc_ag.pag;
 	union xfs_btree_rec	*rec;
 	int			error;
 
@@ -263,11 +263,11 @@ xfs_alloc_get_rec(
 		goto out_bad_rec;
 
 	/* check for valid extent range, including overflow */
-	if (!xfs_verify_agbno(mp, agno, *bno))
+	if (!xfs_verify_agbno(pag, *bno))
 		goto out_bad_rec;
 	if (*bno > *bno + *len)
 		goto out_bad_rec;
-	if (!xfs_verify_agbno(mp, agno, *bno + *len - 1))
+	if (!xfs_verify_agbno(pag, *bno + *len - 1))
 		goto out_bad_rec;
 
 	return 0;
@@ -275,7 +275,8 @@ xfs_alloc_get_rec(
 out_bad_rec:
 	xfs_warn(mp,
 		"%s Freespace BTree record corruption in AG %d detected!",
-		cur->bc_btnum == XFS_BTNUM_BNO ? "Block" : "Size", agno);
+		cur->bc_btnum == XFS_BTNUM_BNO ? "Block" : "Size",
+		pag->pag_agno);
 	xfs_warn(mp,
 		"start block 0x%x block count 0x%x", *bno, *len);
 	return -EFSCORRUPTED;
@@ -703,20 +704,19 @@ const struct xfs_buf_ops xfs_agfl_buf_ops = {
 /*
  * Read in the allocation group free block array.
  */
-int					/* error */
+int
 xfs_alloc_read_agfl(
-	xfs_mount_t	*mp,		/* mount point structure */
-	xfs_trans_t	*tp,		/* transaction pointer */
-	xfs_agnumber_t	agno,		/* allocation group number */
-	struct xfs_buf	**bpp)		/* buffer for the ag free block array */
+	struct xfs_perag	*pag,
+	struct xfs_trans	*tp,
+	struct xfs_buf		**bpp)
 {
-	struct xfs_buf	*bp;		/* return value */
-	int		error;
+	struct xfs_mount	*mp = pag->pag_mount;
+	struct xfs_buf		*bp;
+	int			error;
 
-	ASSERT(agno != NULLAGNUMBER);
 	error = xfs_trans_read_buf(
 			mp, tp, mp->m_ddev_targp,
-			XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)),
+			XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGFL_DADDR(mp)),
 			XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops);
 	if (error)
 		return error;
@@ -1075,7 +1075,8 @@ xfs_alloc_ag_vextent_small(
 	    be32_to_cpu(agf->agf_flcount) <= args->minleft)
 		goto out;
 
-	error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0);
+	error = xfs_alloc_get_freelist(args->pag, args->tp, args->agbp,
+			&fbno, 0);
 	if (error)
 		goto error;
 	if (fbno == NULLAGBLOCK)
@@ -2609,7 +2610,7 @@ xfs_alloc_fix_freelist(
 	ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
 
 	if (!pag->pagf_init) {
-		error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp);
+		error = xfs_alloc_read_agf(pag, tp, flags, &agbp);
 		if (error) {
 			/* Couldn't lock the AGF so skip this AG. */
 			if (error == -EAGAIN)
@@ -2639,7 +2640,7 @@ xfs_alloc_fix_freelist(
 	 * Can fail if we're not blocking on locks, and it's held.
 	 */
 	if (!agbp) {
-		error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp);
+		error = xfs_alloc_read_agf(pag, tp, flags, &agbp);
 		if (error) {
 			/* Couldn't lock the AGF so skip this AG. */
 			if (error == -EAGAIN)
@@ -2697,7 +2698,7 @@ xfs_alloc_fix_freelist(
 	else
 		targs.oinfo = XFS_RMAP_OINFO_AG;
 	while (!(flags & XFS_ALLOC_FLAG_NOSHRINK) && pag->pagf_flcount > need) {
-		error = xfs_alloc_get_freelist(tp, agbp, &bno, 0);
+		error = xfs_alloc_get_freelist(pag, tp, agbp, &bno, 0);
 		if (error)
 			goto out_agbp_relse;
 
@@ -2712,7 +2713,7 @@ xfs_alloc_fix_freelist(
 	targs.alignment = targs.minlen = targs.prod = 1;
 	targs.type = XFS_ALLOCTYPE_THIS_AG;
 	targs.pag = pag;
-	error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp);
+	error = xfs_alloc_read_agfl(pag, tp, &agflbp);
 	if (error)
 		goto out_agbp_relse;
 
@@ -2741,7 +2742,7 @@ xfs_alloc_fix_freelist(
 		 * Put each allocated block on the list.
 		 */
 		for (bno = targs.agbno; bno < targs.agbno + targs.len; bno++) {
-			error = xfs_alloc_put_freelist(tp, agbp,
+			error = xfs_alloc_put_freelist(pag, tp, agbp,
 							agflbp, bno, 0);
 			if (error)
 				goto out_agflbp_relse;
@@ -2767,6 +2768,7 @@ out_no_agbp:
  */
 int
 xfs_alloc_get_freelist(
+	struct xfs_perag	*pag,
 	struct xfs_trans	*tp,
 	struct xfs_buf		*agbp,
 	xfs_agblock_t		*bnop,
@@ -2779,7 +2781,6 @@ xfs_alloc_get_freelist(
 	int			error;
 	uint32_t		logflags;
 	struct xfs_mount	*mp = tp->t_mountp;
-	struct xfs_perag	*pag;
 
 	/*
 	 * Freelist is empty, give up.
@@ -2791,8 +2792,7 @@ xfs_alloc_get_freelist(
 	/*
 	 * Read the array of free blocks.
 	 */
-	error = xfs_alloc_read_agfl(mp, tp, be32_to_cpu(agf->agf_seqno),
-				    &agflbp);
+	error = xfs_alloc_read_agfl(pag, tp, &agflbp);
 	if (error)
 		return error;
 
@@ -2807,7 +2807,6 @@ xfs_alloc_get_freelist(
 	if (be32_to_cpu(agf->agf_flfirst) == xfs_agfl_size(mp))
 		agf->agf_flfirst = 0;
 
-	pag = agbp->b_pag;
 	ASSERT(!pag->pagf_agflreset);
 	be32_add_cpu(&agf->agf_flcount, -1);
 	pag->pagf_flcount--;
@@ -2867,30 +2866,12 @@ xfs_alloc_log_agf(
 	xfs_trans_log_buf(tp, bp, (uint)first, (uint)last);
 }
 
-/*
- * Interface for inode allocation to force the pag data to be initialized.
- */
-int					/* error */
-xfs_alloc_pagf_init(
-	xfs_mount_t		*mp,	/* file system mount structure */
-	xfs_trans_t		*tp,	/* transaction pointer */
-	xfs_agnumber_t		agno,	/* allocation group number */
-	int			flags)	/* XFS_ALLOC_FLAGS_... */
-{
-	struct xfs_buf		*bp;
-	int			error;
-
-	error = xfs_alloc_read_agf(mp, tp, agno, flags, &bp);
-	if (!error)
-		xfs_trans_brelse(tp, bp);
-	return error;
-}
-
 /*
  * Put the block on the freelist for the allocation group.
  */
 int
 xfs_alloc_put_freelist(
+	struct xfs_perag	*pag,
 	struct xfs_trans	*tp,
 	struct xfs_buf		*agbp,
 	struct xfs_buf		*agflbp,
@@ -2899,21 +2880,22 @@ xfs_alloc_put_freelist(
 {
 	struct xfs_mount	*mp = tp->t_mountp;
 	struct xfs_agf		*agf = agbp->b_addr;
-	struct xfs_perag	*pag;
 	__be32			*blockp;
 	int			error;
 	uint32_t		logflags;
 	__be32			*agfl_bno;
 	int			startoff;
 
-	if (!agflbp && (error = xfs_alloc_read_agfl(mp, tp,
-			be32_to_cpu(agf->agf_seqno), &agflbp)))
-		return error;
+	if (!agflbp) {
+		error = xfs_alloc_read_agfl(pag, tp, &agflbp);
+		if (error)
+			return error;
+	}
+
 	be32_add_cpu(&agf->agf_fllast, 1);
 	if (be32_to_cpu(agf->agf_fllast) == xfs_agfl_size(mp))
 		agf->agf_fllast = 0;
 
-	pag = agbp->b_pag;
 	ASSERT(!pag->pagf_agflreset);
 	be32_add_cpu(&agf->agf_flcount, 1);
 	pag->pagf_flcount++;
@@ -3070,61 +3052,57 @@ const struct xfs_buf_ops xfs_agf_buf_ops = {
 /*
  * Read in the allocation group header (free/alloc section).
  */
-int					/* error */
+int
 xfs_read_agf(
-	struct xfs_mount	*mp,	/* mount point structure */
-	struct xfs_trans	*tp,	/* transaction pointer */
-	xfs_agnumber_t		agno,	/* allocation group number */
-	int			flags,	/* XFS_BUF_ */
-	struct xfs_buf		**bpp)	/* buffer for the ag freelist header */
+	struct xfs_perag	*pag,
+	struct xfs_trans	*tp,
+	int			flags,
+	struct xfs_buf		**agfbpp)
 {
-	int		error;
+	struct xfs_mount	*mp = pag->pag_mount;
+	int			error;
 
-	trace_xfs_read_agf(mp, agno);
+	trace_xfs_read_agf(pag->pag_mount, pag->pag_agno);
 
-	ASSERT(agno != NULLAGNUMBER);
 	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
-			XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
-			XFS_FSS_TO_BB(mp, 1), flags, bpp, &xfs_agf_buf_ops);
+			XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGF_DADDR(mp)),
+			XFS_FSS_TO_BB(mp, 1), flags, agfbpp, &xfs_agf_buf_ops);
 	if (error)
 		return error;
 
-	ASSERT(!(*bpp)->b_error);
-	xfs_buf_set_ref(*bpp, XFS_AGF_REF);
+	xfs_buf_set_ref(*agfbpp, XFS_AGF_REF);
 	return 0;
 }
 
 /*
- * Read in the allocation group header (free/alloc section).
+ * Read in the allocation group header (free/alloc section) and initialise the
+ * perag structure if necessary. If the caller provides @agfbpp, then return the
+ * locked buffer to the caller, otherwise free it.
  */
-int					/* error */
+int
 xfs_alloc_read_agf(
-	struct xfs_mount	*mp,	/* mount point structure */
-	struct xfs_trans	*tp,	/* transaction pointer */
-	xfs_agnumber_t		agno,	/* allocation group number */
-	int			flags,	/* XFS_ALLOC_FLAG_... */
-	struct xfs_buf		**bpp)	/* buffer for the ag freelist header */
+	struct xfs_perag	*pag,
+	struct xfs_trans	*tp,
+	int			flags,
+	struct xfs_buf		**agfbpp)
 {
-	struct xfs_agf		*agf;		/* ag freelist header */
-	struct xfs_perag	*pag;		/* per allocation group data */
+	struct xfs_buf		*agfbp;
+	struct xfs_agf		*agf;
 	int			error;
 	int			allocbt_blks;
 
-	trace_xfs_alloc_read_agf(mp, agno);
+	trace_xfs_alloc_read_agf(pag->pag_mount, pag->pag_agno);
 
 	/* We don't support trylock when freeing. */
 	ASSERT((flags & (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK)) !=
 			(XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK));
-	ASSERT(agno != NULLAGNUMBER);
-	error = xfs_read_agf(mp, tp, agno,
+	error = xfs_read_agf(pag, tp,
 			(flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0,
-			bpp);
+			&agfbp);
 	if (error)
 		return error;
-	ASSERT(!(*bpp)->b_error);
 
-	agf = (*bpp)->b_addr;
-	pag = (*bpp)->b_pag;
+	agf = agfbp->b_addr;
 	if (!pag->pagf_init) {
 		pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
 		pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
@@ -3138,7 +3116,7 @@ xfs_alloc_read_agf(
 			be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]);
 		pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level);
 		pag->pagf_init = 1;
-		pag->pagf_agflreset = xfs_agfl_needs_reset(mp, agf);
+		pag->pagf_agflreset = xfs_agfl_needs_reset(pag->pag_mount, agf);
 
 		/*
 		 * Update the in-core allocbt counter. Filter out the rmapbt
@@ -3148,13 +3126,14 @@ xfs_alloc_read_agf(
 		 * counter only tracks non-root blocks.
 		 */
 		allocbt_blks = pag->pagf_btreeblks;
-		if (xfs_has_rmapbt(mp))
+		if (xfs_has_rmapbt(pag->pag_mount))
 			allocbt_blks -= be32_to_cpu(agf->agf_rmap_blocks) - 1;
 		if (allocbt_blks > 0)
-			atomic64_add(allocbt_blks, &mp->m_allocbt_blks);
+			atomic64_add(allocbt_blks,
+					&pag->pag_mount->m_allocbt_blks);
 	}
 #ifdef DEBUG
-	else if (!xfs_is_shutdown(mp)) {
+	else if (!xfs_is_shutdown(pag->pag_mount)) {
 		ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks));
 		ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks));
 		ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount));
@@ -3165,6 +3144,10 @@ xfs_alloc_read_agf(
 		       be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]));
 	}
 #endif
+	if (agfbpp)
+		*agfbpp = agfbp;
+	else
+		xfs_trans_brelse(tp, agfbp);
 	return 0;
 }
 
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index 84ca09b2223f..2c3f762dfb58 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -95,6 +95,11 @@ xfs_extlen_t xfs_alloc_longest_free_extent(struct xfs_perag *pag,
 		xfs_extlen_t need, xfs_extlen_t reserved);
 unsigned int xfs_alloc_min_freelist(struct xfs_mount *mp,
 		struct xfs_perag *pag);
+int xfs_alloc_get_freelist(struct xfs_perag *pag, struct xfs_trans *tp,
+		struct xfs_buf *agfbp, xfs_agblock_t *bnop, int	 btreeblk);
+int xfs_alloc_put_freelist(struct xfs_perag *pag, struct xfs_trans *tp,
+		struct xfs_buf *agfbp, struct xfs_buf *agflbp,
+		xfs_agblock_t bno, int btreeblk);
 
 /*
  * Compute and fill in value of m_alloc_maxlevels.
@@ -103,17 +108,6 @@ void
 xfs_alloc_compute_maxlevels(
 	struct xfs_mount	*mp);	/* file system mount structure */
 
-/*
- * Get a block from the freelist.
- * Returns with the buffer for the block gotten.
- */
-int				/* error */
-xfs_alloc_get_freelist(
-	struct xfs_trans *tp,	/* transaction pointer */
-	struct xfs_buf	*agbp,	/* buffer containing the agf structure */
-	xfs_agblock_t	*bnop,	/* block address retrieved from freelist */
-	int		btreeblk); /* destination is a AGF btree */
-
 /*
  * Log the given fields from the agf structure.
  */
@@ -123,38 +117,6 @@ xfs_alloc_log_agf(
 	struct xfs_buf	*bp,	/* buffer for a.g. freelist header */
 	uint32_t	fields);/* mask of fields to be logged (XFS_AGF_...) */
 
-/*
- * Interface for inode allocation to force the pag data to be initialized.
- */
-int				/* error */
-xfs_alloc_pagf_init(
-	struct xfs_mount *mp,	/* file system mount structure */
-	struct xfs_trans *tp,	/* transaction pointer */
-	xfs_agnumber_t	agno,	/* allocation group number */
-	int		flags);	/* XFS_ALLOC_FLAGS_... */
-
-/*
- * Put the block on the freelist for the allocation group.
- */
-int				/* error */
-xfs_alloc_put_freelist(
-	struct xfs_trans *tp,	/* transaction pointer */
-	struct xfs_buf	*agbp,	/* buffer for a.g. freelist header */
-	struct xfs_buf	*agflbp,/* buffer for a.g. free block array */
-	xfs_agblock_t	bno,	/* block being freed */
-	int		btreeblk); /* owner was a AGF btree */
-
-/*
- * Read in the allocation group header (free/alloc section).
- */
-int					/* error  */
-xfs_alloc_read_agf(
-	struct xfs_mount *mp,		/* mount point structure */
-	struct xfs_trans *tp,		/* transaction pointer */
-	xfs_agnumber_t	agno,		/* allocation group number */
-	int		flags,		/* XFS_ALLOC_FLAG_... */
-	struct xfs_buf	**bpp);		/* buffer for the ag freelist header */
-
 /*
  * Allocate an extent (variable-size).
  */
@@ -206,10 +168,12 @@ xfs_alloc_get_rec(
 	xfs_extlen_t		*len,	/* output: length of extent */
 	int			*stat);	/* output: success/failure */
 
-int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp,
-			xfs_agnumber_t agno, int flags, struct xfs_buf **bpp);
-int xfs_alloc_read_agfl(struct xfs_mount *mp, struct xfs_trans *tp,
-			xfs_agnumber_t agno, struct xfs_buf **bpp);
+int xfs_read_agf(struct xfs_perag *pag, struct xfs_trans *tp, int flags,
+		struct xfs_buf **agfbpp);
+int xfs_alloc_read_agf(struct xfs_perag *pag, struct xfs_trans *tp, int flags,
+		struct xfs_buf **agfbpp);
+int xfs_alloc_read_agfl(struct xfs_perag *pag, struct xfs_trans *tp,
+		struct xfs_buf **bpp);
 int xfs_free_agfl_block(struct xfs_trans *, xfs_agnumber_t, xfs_agblock_t,
 			struct xfs_buf *, struct xfs_owner_info *);
 int xfs_alloc_fix_freelist(struct xfs_alloc_arg *args, int flags);
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index 8c9f73cc0bee..549a3cba0234 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -60,8 +60,8 @@ xfs_allocbt_alloc_block(
 	xfs_agblock_t		bno;
 
 	/* Allocate the new block from the freelist. If we can't, give up.  */
-	error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_ag.agbp,
-				       &bno, 1);
+	error = xfs_alloc_get_freelist(cur->bc_ag.pag, cur->bc_tp,
+			cur->bc_ag.agbp, &bno, 1);
 	if (error)
 		return error;
 
@@ -71,7 +71,7 @@ xfs_allocbt_alloc_block(
 	}
 
 	atomic64_inc(&cur->bc_mp->m_allocbt_blks);
-	xfs_extent_busy_reuse(cur->bc_mp, cur->bc_ag.agbp->b_pag, bno, 1, false);
+	xfs_extent_busy_reuse(cur->bc_mp, cur->bc_ag.pag, bno, 1, false);
 
 	new->s = cpu_to_be32(bno);
 
@@ -89,7 +89,8 @@ xfs_allocbt_free_block(
 	int			error;
 
 	bno = xfs_daddr_to_agbno(cur->bc_mp, xfs_buf_daddr(bp));
-	error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1);
+	error = xfs_alloc_put_freelist(cur->bc_ag.pag, cur->bc_tp, agbp, NULL,
+			bno, 1);
 	if (error)
 		return error;
 
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index 224649a76cbb..e28d93d232de 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -67,12 +67,10 @@ int
 xfs_inode_hasattr(
 	struct xfs_inode	*ip)
 {
-	if (!XFS_IFORK_Q(ip))
+	if (!xfs_inode_has_attr_fork(ip))
 		return 0;
-	if (!ip->i_afp)
-		return 0;
-	if (ip->i_afp->if_format == XFS_DINODE_FMT_EXTENTS &&
-	    ip->i_afp->if_nextents == 0)
+	if (ip->i_af.if_format == XFS_DINODE_FMT_EXTENTS &&
+	    ip->i_af.if_nextents == 0)
 		return 0;
 	return 1;
 }
@@ -85,7 +83,7 @@ bool
 xfs_attr_is_leaf(
 	struct xfs_inode	*ip)
 {
-	struct xfs_ifork	*ifp = ip->i_afp;
+	struct xfs_ifork	*ifp = &ip->i_af;
 	struct xfs_iext_cursor	icur;
 	struct xfs_bmbt_irec	imap;
 
@@ -231,7 +229,7 @@ xfs_attr_get_ilocked(
 	if (!xfs_inode_hasattr(args->dp))
 		return -ENOATTR;
 
-	if (args->dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL)
+	if (args->dp->i_af.if_format == XFS_DINODE_FMT_LOCAL)
 		return xfs_attr_shortform_getvalue(args);
 	if (xfs_attr_is_leaf(args->dp))
 		return xfs_attr_leaf_get(args);
@@ -354,7 +352,7 @@ xfs_attr_try_sf_addname(
 	/*
 	 * Build initial attribute list (if required).
 	 */
-	if (dp->i_afp->if_format == XFS_DINODE_FMT_EXTENTS)
+	if (dp->i_af.if_format == XFS_DINODE_FMT_EXTENTS)
 		xfs_attr_shortform_create(args);
 
 	error = xfs_attr_shortform_addname(args);
@@ -864,7 +862,7 @@ xfs_attr_lookup(
 	if (!xfs_inode_hasattr(dp))
 		return -ENOATTR;
 
-	if (dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL)
+	if (dp->i_af.if_format == XFS_DINODE_FMT_LOCAL)
 		return xfs_attr_sf_findname(args, NULL, NULL);
 
 	if (xfs_attr_is_leaf(dp)) {
@@ -1001,7 +999,7 @@ xfs_attr_set(
 		 * If the inode doesn't have an attribute fork, add one.
 		 * (inode must not be locked when we call this routine)
 		 */
-		if (XFS_IFORK_Q(dp) == 0) {
+		if (xfs_inode_has_attr_fork(dp) == 0) {
 			int sf_size = sizeof(struct xfs_attr_sf_hdr) +
 				xfs_attr_sf_entsize_byname(args->namelen,
 						args->valuelen);
@@ -1101,7 +1099,7 @@ static inline int xfs_attr_sf_totsize(struct xfs_inode *dp)
 {
 	struct xfs_attr_shortform *sf;
 
-	sf = (struct xfs_attr_shortform *)dp->i_afp->if_u1.if_data;
+	sf = (struct xfs_attr_shortform *)dp->i_af.if_u1.if_data;
 	return be16_to_cpu(sf->hdr.totsize);
 }
 
@@ -1558,7 +1556,7 @@ xfs_attr_node_get(
 	 * If not in a transaction, we have to release all the buffers.
 	 */
 out_release:
-	for (i = 0; state != NULL && i < state->path.active; i++) {
+	for (i = 0; i < state->path.active; i++) {
 		xfs_trans_brelse(args->trans, state->path.blk[i].bp);
 		state->path.blk[i].bp = NULL;
 	}
diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h
index dfb47fa63c6d..81be9b3e4004 100644
--- a/fs/xfs/libxfs/xfs_attr.h
+++ b/fs/xfs/libxfs/xfs_attr.h
@@ -560,9 +560,9 @@ static inline bool
 xfs_attr_is_shortform(
 	struct xfs_inode    *ip)
 {
-	return ip->i_afp->if_format == XFS_DINODE_FMT_LOCAL ||
-	       (ip->i_afp->if_format == XFS_DINODE_FMT_EXTENTS &&
-		ip->i_afp->if_nextents == 0);
+	return ip->i_af.if_format == XFS_DINODE_FMT_LOCAL ||
+	       (ip->i_af.if_format == XFS_DINODE_FMT_EXTENTS &&
+		ip->i_af.if_nextents == 0);
 }
 
 static inline enum xfs_delattr_state
@@ -573,10 +573,10 @@ xfs_attr_init_add_state(struct xfs_da_args *args)
 	 * next state, the attribute fork may be null. This can occur only occur
 	 * on a pure remove, but we grab the next state before we check if a
 	 * replace operation is being performed. If we are called from any other
-	 * context, i_afp is guaranteed to exist. Hence if the attr fork is
+	 * context, i_af is guaranteed to exist. Hence if the attr fork is
 	 * null, we were called from a pure remove operation and so we are done.
 	 */
-	if (!args->dp->i_afp)
+	if (!xfs_inode_has_attr_fork(args->dp))
 		return XFS_DAS_DONE;
 
 	args->op_flags |= XFS_DA_OP_ADDNAME;
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 8f47396f8dd2..beee51ad75ce 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -590,7 +590,7 @@ xfs_attr_shortform_bytesfit(
 	 * to real extents, or the delalloc conversion will take care of the
 	 * literal area rebalancing.
 	 */
-	if (bytes <= XFS_IFORK_ASIZE(dp))
+	if (bytes <= xfs_inode_attr_fork_size(dp))
 		return dp->i_forkoff;
 
 	/*
@@ -682,7 +682,7 @@ xfs_attr_shortform_create(
 	struct xfs_da_args	*args)
 {
 	struct xfs_inode	*dp = args->dp;
-	struct xfs_ifork	*ifp = dp->i_afp;
+	struct xfs_ifork	*ifp = &dp->i_af;
 	struct xfs_attr_sf_hdr	*hdr;
 
 	trace_xfs_attr_sf_create(args);
@@ -719,7 +719,7 @@ xfs_attr_sf_findname(
 	int			end;
 	int			i;
 
-	sf = (struct xfs_attr_shortform *)args->dp->i_afp->if_u1.if_data;
+	sf = (struct xfs_attr_shortform *)args->dp->i_af.if_u1.if_data;
 	sfe = &sf->list[0];
 	end = sf->hdr.count;
 	for (i = 0; i < end; sfe = xfs_attr_sf_nextentry(sfe),
@@ -764,7 +764,7 @@ xfs_attr_shortform_add(
 	mp = dp->i_mount;
 	dp->i_forkoff = forkoff;
 
-	ifp = dp->i_afp;
+	ifp = &dp->i_af;
 	ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL);
 	sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data;
 	if (xfs_attr_sf_findname(args, &sfe, NULL) == -EEXIST)
@@ -797,11 +797,9 @@ xfs_attr_fork_remove(
 	struct xfs_inode	*ip,
 	struct xfs_trans	*tp)
 {
-	ASSERT(ip->i_afp->if_nextents == 0);
+	ASSERT(ip->i_af.if_nextents == 0);
 
-	xfs_idestroy_fork(ip->i_afp);
-	kmem_cache_free(xfs_ifork_cache, ip->i_afp);
-	ip->i_afp = NULL;
+	xfs_ifork_zap_attr(ip);
 	ip->i_forkoff = 0;
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 }
@@ -825,7 +823,7 @@ xfs_attr_sf_removename(
 
 	dp = args->dp;
 	mp = dp->i_mount;
-	sf = (struct xfs_attr_shortform *)dp->i_afp->if_u1.if_data;
+	sf = (struct xfs_attr_shortform *)dp->i_af.if_u1.if_data;
 
 	error = xfs_attr_sf_findname(args, &sfe, &base);
 
@@ -889,7 +887,7 @@ xfs_attr_shortform_lookup(xfs_da_args_t *args)
 
 	trace_xfs_attr_sf_lookup(args);
 
-	ifp = args->dp->i_afp;
+	ifp = &args->dp->i_af;
 	ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL);
 	sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data;
 	sfe = &sf->list[0];
@@ -917,8 +915,8 @@ xfs_attr_shortform_getvalue(
 	struct xfs_attr_sf_entry *sfe;
 	int			i;
 
-	ASSERT(args->dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL);
-	sf = (struct xfs_attr_shortform *)args->dp->i_afp->if_u1.if_data;
+	ASSERT(args->dp->i_af.if_format == XFS_DINODE_FMT_LOCAL);
+	sf = (struct xfs_attr_shortform *)args->dp->i_af.if_u1.if_data;
 	sfe = &sf->list[0];
 	for (i = 0; i < sf->hdr.count;
 				sfe = xfs_attr_sf_nextentry(sfe), i++) {
@@ -948,7 +946,7 @@ xfs_attr_shortform_to_leaf(
 	trace_xfs_attr_sf_to_leaf(args);
 
 	dp = args->dp;
-	ifp = dp->i_afp;
+	ifp = &dp->i_af;
 	sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data;
 	size = be16_to_cpu(sf->hdr.totsize);
 	tmpbuffer = kmem_alloc(size, 0);
@@ -1055,8 +1053,8 @@ xfs_attr_shortform_verify(
 	int				i;
 	int64_t				size;
 
-	ASSERT(ip->i_afp->if_format == XFS_DINODE_FMT_LOCAL);
-	ifp = XFS_IFORK_PTR(ip, XFS_ATTR_FORK);
+	ASSERT(ip->i_af.if_format == XFS_DINODE_FMT_LOCAL);
+	ifp = xfs_ifork_ptr(ip, XFS_ATTR_FORK);
 	sfp = (struct xfs_attr_shortform *)ifp->if_u1.if_data;
 	size = ifp->if_bytes;
 
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index 7298c148f848..d440393b40eb 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -543,6 +543,7 @@ xfs_attr_rmtval_stale(
 {
 	struct xfs_mount	*mp = ip->i_mount;
 	struct xfs_buf		*bp;
+	int			error;
 
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 
@@ -550,14 +551,18 @@ xfs_attr_rmtval_stale(
 	    XFS_IS_CORRUPT(mp, map->br_startblock == HOLESTARTBLOCK))
 		return -EFSCORRUPTED;
 
-	bp = xfs_buf_incore(mp->m_ddev_targp,
+	error = xfs_buf_incore(mp->m_ddev_targp,
 			XFS_FSB_TO_DADDR(mp, map->br_startblock),
-			XFS_FSB_TO_BB(mp, map->br_blockcount), incore_flags);
-	if (bp) {
-		xfs_buf_stale(bp);
-		xfs_buf_relse(bp);
+			XFS_FSB_TO_BB(mp, map->br_blockcount),
+			incore_flags, &bp);
+	if (error) {
+		if (error == -ENOENT)
+			return 0;
+		return error;
 	}
 
+	xfs_buf_stale(bp);
+	xfs_buf_relse(bp);
 	return 0;
 }
 
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 6833110d1bd4..e56723dc9cd5 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -128,7 +128,7 @@ xfs_bmbt_lookup_first(
  */
 static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork)
 {
-	struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+	struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
 
 	return whichfork != XFS_COW_FORK &&
 		ifp->if_format == XFS_DINODE_FMT_EXTENTS &&
@@ -140,7 +140,7 @@ static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork)
  */
 static inline bool xfs_bmap_wants_extents(struct xfs_inode *ip, int whichfork)
 {
-	struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+	struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
 
 	return whichfork != XFS_COW_FORK &&
 		ifp->if_format == XFS_DINODE_FMT_BTREE &&
@@ -319,7 +319,7 @@ xfs_bmap_check_leaf_extents(
 	int			whichfork)	/* data or attr fork */
 {
 	struct xfs_mount	*mp = ip->i_mount;
-	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
 	struct xfs_btree_block	*block;	/* current btree block */
 	xfs_fsblock_t		bno;	/* block # of "block" */
 	struct xfs_buf		*bp;	/* buffer for "block" */
@@ -538,7 +538,7 @@ xfs_bmap_btree_to_extents(
 	int			*logflagsp, /* inode logging flags */
 	int			whichfork)  /* data or attr fork */
 {
-	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
 	struct xfs_mount	*mp = ip->i_mount;
 	struct xfs_btree_block	*rblock = ifp->if_broot;
 	struct xfs_btree_block	*cblock;/* child btree block */
@@ -616,7 +616,7 @@ xfs_bmap_extents_to_btree(
 
 	mp = ip->i_mount;
 	ASSERT(whichfork != XFS_COW_FORK);
-	ifp = XFS_IFORK_PTR(ip, whichfork);
+	ifp = xfs_ifork_ptr(ip, whichfork);
 	ASSERT(ifp->if_format == XFS_DINODE_FMT_EXTENTS);
 
 	/*
@@ -745,7 +745,7 @@ xfs_bmap_local_to_extents_empty(
 	struct xfs_inode	*ip,
 	int			whichfork)
 {
-	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
 
 	ASSERT(whichfork != XFS_COW_FORK);
 	ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL);
@@ -785,7 +785,7 @@ xfs_bmap_local_to_extents(
 	 * So sending the data fork of a regular inode is invalid.
 	 */
 	ASSERT(!(S_ISREG(VFS_I(ip)->i_mode) && whichfork == XFS_DATA_FORK));
-	ifp = XFS_IFORK_PTR(ip, whichfork);
+	ifp = xfs_ifork_ptr(ip, whichfork);
 	ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL);
 
 	if (!ifp->if_bytes) {
@@ -880,7 +880,7 @@ xfs_bmap_add_attrfork_btree(
 
 	mp = ip->i_mount;
 
-	if (XFS_BMAP_BMDR_SPACE(block) <= XFS_IFORK_DSIZE(ip))
+	if (XFS_BMAP_BMDR_SPACE(block) <= xfs_inode_data_fork_size(ip))
 		*flags |= XFS_ILOG_DBROOT;
 	else {
 		cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK);
@@ -920,7 +920,7 @@ xfs_bmap_add_attrfork_extents(
 	int			error;		/* error return value */
 
 	if (ip->i_df.if_nextents * sizeof(struct xfs_bmbt_rec) <=
-	    XFS_IFORK_DSIZE(ip))
+	    xfs_inode_data_fork_size(ip))
 		return 0;
 	cur = NULL;
 	error = xfs_bmap_extents_to_btree(tp, ip, &cur, 0, flags,
@@ -951,7 +951,7 @@ xfs_bmap_add_attrfork_local(
 {
 	struct xfs_da_args	dargs;		/* args for dir/attr code */
 
-	if (ip->i_df.if_bytes <= XFS_IFORK_DSIZE(ip))
+	if (ip->i_df.if_bytes <= xfs_inode_data_fork_size(ip))
 		return 0;
 
 	if (S_ISDIR(VFS_I(ip)->i_mode)) {
@@ -1023,7 +1023,7 @@ xfs_bmap_add_attrfork(
 	int			logflags;	/* logging flags */
 	int			error;		/* error return value */
 
-	ASSERT(XFS_IFORK_Q(ip) == 0);
+	ASSERT(xfs_inode_has_attr_fork(ip) == 0);
 
 	mp = ip->i_mount;
 	ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
@@ -1034,16 +1034,15 @@ xfs_bmap_add_attrfork(
 			rsvd, &tp);
 	if (error)
 		return error;
-	if (XFS_IFORK_Q(ip))
+	if (xfs_inode_has_attr_fork(ip))
 		goto trans_cancel;
 
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 	error = xfs_bmap_set_attrforkoff(ip, size, &version);
 	if (error)
 		goto trans_cancel;
-	ASSERT(ip->i_afp == NULL);
 
-	ip->i_afp = xfs_ifork_alloc(XFS_DINODE_FMT_EXTENTS, 0);
+	xfs_ifork_init_attr(ip, XFS_DINODE_FMT_EXTENTS, 0);
 	logflags = 0;
 	switch (ip->i_df.if_format) {
 	case XFS_DINODE_FMT_LOCAL:
@@ -1116,7 +1115,7 @@ xfs_iread_bmbt_block(
 	xfs_extnum_t		num_recs;
 	xfs_extnum_t		j;
 	int			whichfork = cur->bc_ino.whichfork;
-	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
 
 	block = xfs_btree_get_block(cur, level, &bp);
 
@@ -1164,7 +1163,7 @@ xfs_iread_extents(
 	int			whichfork)
 {
 	struct xfs_iread_state	ir;
-	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
 	struct xfs_mount	*mp = ip->i_mount;
 	struct xfs_btree_cur	*cur;
 	int			error;
@@ -1208,7 +1207,7 @@ xfs_bmap_first_unused(
 	xfs_fileoff_t		*first_unused,	/* unused block */
 	int			whichfork)	/* data or attr fork */
 {
-	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
 	struct xfs_bmbt_irec	got;
 	struct xfs_iext_cursor	icur;
 	xfs_fileoff_t		lastaddr = 0;
@@ -1255,7 +1254,7 @@ xfs_bmap_last_before(
 	xfs_fileoff_t		*last_block,	/* last block */
 	int			whichfork)	/* data or attr fork */
 {
-	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
 	struct xfs_bmbt_irec	got;
 	struct xfs_iext_cursor	icur;
 	int			error;
@@ -1289,7 +1288,7 @@ xfs_bmap_last_extent(
 	struct xfs_bmbt_irec	*rec,
 	int			*is_empty)
 {
-	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
 	struct xfs_iext_cursor	icur;
 	int			error;
 
@@ -1355,7 +1354,7 @@ xfs_bmap_last_offset(
 	xfs_fileoff_t		*last_block,
 	int			whichfork)
 {
-	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
 	struct xfs_bmbt_irec	rec;
 	int			is_empty;
 	int			error;
@@ -1389,7 +1388,7 @@ xfs_bmap_add_extent_delay_real(
 	int			whichfork)
 {
 	struct xfs_mount	*mp = bma->ip->i_mount;
-	struct xfs_ifork	*ifp = XFS_IFORK_PTR(bma->ip, whichfork);
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(bma->ip, whichfork);
 	struct xfs_bmbt_irec	*new = &bma->got;
 	int			error;	/* error return value */
 	int			i;	/* temp state */
@@ -1955,7 +1954,7 @@ xfs_bmap_add_extent_unwritten_real(
 	*logflagsp = 0;
 
 	cur = *curp;
-	ifp = XFS_IFORK_PTR(ip, whichfork);
+	ifp = xfs_ifork_ptr(ip, whichfork);
 
 	ASSERT(!isnullstartblock(new->br_startblock));
 
@@ -2480,7 +2479,7 @@ xfs_bmap_add_extent_hole_delay(
 	uint32_t		state = xfs_bmap_fork_to_state(whichfork);
 	xfs_filblks_t		temp;	 /* temp for indirect calculations */
 
-	ifp = XFS_IFORK_PTR(ip, whichfork);
+	ifp = xfs_ifork_ptr(ip, whichfork);
 	ASSERT(isnullstartblock(new->br_startblock));
 
 	/*
@@ -2616,7 +2615,7 @@ xfs_bmap_add_extent_hole_real(
 	int			*logflagsp,
 	uint32_t		flags)
 {
-	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
 	struct xfs_mount	*mp = ip->i_mount;
 	struct xfs_btree_cur	*cur = *curp;
 	int			error;	/* error return value */
@@ -3185,7 +3184,8 @@ xfs_bmap_longest_free_extent(
 
 	pag = xfs_perag_get(mp, ag);
 	if (!pag->pagf_init) {
-		error = xfs_alloc_pagf_init(mp, tp, ag, XFS_ALLOC_FLAG_TRYLOCK);
+		error = xfs_alloc_read_agf(pag, tp, XFS_ALLOC_FLAG_TRYLOCK,
+				NULL);
 		if (error) {
 			/* Couldn't lock the AGF, so skip this AG. */
 			if (error == -EAGAIN) {
@@ -3866,7 +3866,7 @@ xfs_bmapi_read(
 {
 	struct xfs_mount	*mp = ip->i_mount;
 	int			whichfork = xfs_bmapi_whichfork(flags);
-	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
 	struct xfs_bmbt_irec	got;
 	xfs_fileoff_t		obno;
 	xfs_fileoff_t		end;
@@ -3959,7 +3959,7 @@ xfs_bmapi_reserve_delalloc(
 	int			eof)
 {
 	struct xfs_mount	*mp = ip->i_mount;
-	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
 	xfs_extlen_t		alen;
 	xfs_extlen_t		indlen;
 	int			error;
@@ -4086,7 +4086,7 @@ xfs_bmapi_allocate(
 {
 	struct xfs_mount	*mp = bma->ip->i_mount;
 	int			whichfork = xfs_bmapi_whichfork(bma->flags);
-	struct xfs_ifork	*ifp = XFS_IFORK_PTR(bma->ip, whichfork);
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(bma->ip, whichfork);
 	int			tmp_logflags = 0;
 	int			error;
 
@@ -4185,7 +4185,7 @@ xfs_bmapi_convert_unwritten(
 	uint32_t		flags)
 {
 	int			whichfork = xfs_bmapi_whichfork(flags);
-	struct xfs_ifork	*ifp = XFS_IFORK_PTR(bma->ip, whichfork);
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(bma->ip, whichfork);
 	int			tmp_logflags = 0;
 	int			error;
 
@@ -4262,7 +4262,7 @@ xfs_bmapi_minleft(
 	struct xfs_inode	*ip,
 	int			fork)
 {
-	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, fork);
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, fork);
 
 	if (tp && tp->t_firstblock != NULLFSBLOCK)
 		return 0;
@@ -4283,7 +4283,7 @@ xfs_bmapi_finish(
 	int			whichfork,
 	int			error)
 {
-	struct xfs_ifork	*ifp = XFS_IFORK_PTR(bma->ip, whichfork);
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(bma->ip, whichfork);
 
 	if ((bma->logflags & xfs_ilog_fext(whichfork)) &&
 	    ifp->if_format != XFS_DINODE_FMT_EXTENTS)
@@ -4322,7 +4322,7 @@ xfs_bmapi_write(
 	};
 	struct xfs_mount	*mp = ip->i_mount;
 	int			whichfork = xfs_bmapi_whichfork(flags);
-	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
 	xfs_fileoff_t		end;		/* end of mapped file region */
 	bool			eof = false;	/* after the end of extents */
 	int			error;		/* error return */
@@ -4503,7 +4503,7 @@ xfs_bmapi_convert_delalloc(
 	struct iomap		*iomap,
 	unsigned int		*seq)
 {
-	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
 	struct xfs_mount	*mp = ip->i_mount;
 	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
 	struct xfs_bmalloca	bma = { NULL };
@@ -4640,7 +4640,7 @@ xfs_bmapi_remap(
 	int			whichfork = xfs_bmapi_whichfork(flags);
 	int			logflags = 0, error;
 
-	ifp = XFS_IFORK_PTR(ip, whichfork);
+	ifp = xfs_ifork_ptr(ip, whichfork);
 	ASSERT(len > 0);
 	ASSERT(len <= (xfs_filblks_t)XFS_MAX_BMBT_EXTLEN);
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
@@ -4797,7 +4797,7 @@ xfs_bmap_del_extent_delay(
 	struct xfs_bmbt_irec	*del)
 {
 	struct xfs_mount	*mp = ip->i_mount;
-	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
 	struct xfs_bmbt_irec	new;
 	int64_t			da_old, da_new, da_diff = 0;
 	xfs_fileoff_t		del_endoff, got_endoff;
@@ -4924,7 +4924,7 @@ xfs_bmap_del_extent_cow(
 	struct xfs_bmbt_irec	*del)
 {
 	struct xfs_mount	*mp = ip->i_mount;
-	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, XFS_COW_FORK);
 	struct xfs_bmbt_irec	new;
 	xfs_fileoff_t		del_endoff, got_endoff;
 	uint32_t		state = BMAP_COWFORK;
@@ -5022,7 +5022,7 @@ xfs_bmap_del_extent_real(
 	mp = ip->i_mount;
 	XFS_STATS_INC(mp, xs_del_exlist);
 
-	ifp = XFS_IFORK_PTR(ip, whichfork);
+	ifp = xfs_ifork_ptr(ip, whichfork);
 	ASSERT(del->br_blockcount > 0);
 	xfs_iext_get_extent(ifp, icur, &got);
 	ASSERT(got.br_startoff <= del->br_startoff);
@@ -5288,7 +5288,7 @@ __xfs_bunmapi(
 
 	whichfork = xfs_bmapi_whichfork(flags);
 	ASSERT(whichfork != XFS_COW_FORK);
-	ifp = XFS_IFORK_PTR(ip, whichfork);
+	ifp = xfs_ifork_ptr(ip, whichfork);
 	if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)))
 		return -EFSCORRUPTED;
 	if (xfs_is_shutdown(mp))
@@ -5629,7 +5629,7 @@ xfs_bmse_merge(
 	struct xfs_btree_cur		*cur,
 	int				*logflags)	/* output */
 {
-	struct xfs_ifork		*ifp = XFS_IFORK_PTR(ip, whichfork);
+	struct xfs_ifork		*ifp = xfs_ifork_ptr(ip, whichfork);
 	struct xfs_bmbt_irec		new;
 	xfs_filblks_t			blockcount;
 	int				error, i;
@@ -5750,7 +5750,7 @@ xfs_bmap_collapse_extents(
 {
 	int			whichfork = XFS_DATA_FORK;
 	struct xfs_mount	*mp = ip->i_mount;
-	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
 	struct xfs_btree_cur	*cur = NULL;
 	struct xfs_bmbt_irec	got, prev;
 	struct xfs_iext_cursor	icur;
@@ -5865,7 +5865,7 @@ xfs_bmap_insert_extents(
 {
 	int			whichfork = XFS_DATA_FORK;
 	struct xfs_mount	*mp = ip->i_mount;
-	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
 	struct xfs_btree_cur	*cur = NULL;
 	struct xfs_bmbt_irec	got, next;
 	struct xfs_iext_cursor	icur;
@@ -5965,7 +5965,7 @@ xfs_bmap_split_extent(
 	xfs_fileoff_t		split_fsb)
 {
 	int				whichfork = XFS_DATA_FORK;
-	struct xfs_ifork		*ifp = XFS_IFORK_PTR(ip, whichfork);
+	struct xfs_ifork		*ifp = xfs_ifork_ptr(ip, whichfork);
 	struct xfs_btree_cur		*cur = NULL;
 	struct xfs_bmbt_irec		got;
 	struct xfs_bmbt_irec		new; /* split extent */
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index 2b77d45c215f..cfa052d40105 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -304,7 +304,7 @@ xfs_bmbt_get_minrecs(
 	if (level == cur->bc_nlevels - 1) {
 		struct xfs_ifork	*ifp;
 
-		ifp = XFS_IFORK_PTR(cur->bc_ino.ip,
+		ifp = xfs_ifork_ptr(cur->bc_ino.ip,
 				    cur->bc_ino.whichfork);
 
 		return xfs_bmbt_maxrecs(cur->bc_mp,
@@ -322,7 +322,7 @@ xfs_bmbt_get_maxrecs(
 	if (level == cur->bc_nlevels - 1) {
 		struct xfs_ifork	*ifp;
 
-		ifp = XFS_IFORK_PTR(cur->bc_ino.ip,
+		ifp = xfs_ifork_ptr(cur->bc_ino.ip,
 				    cur->bc_ino.whichfork);
 
 		return xfs_bmbt_maxrecs(cur->bc_mp,
@@ -550,7 +550,7 @@ xfs_bmbt_init_cursor(
 	struct xfs_inode	*ip,		/* inode owning the btree */
 	int			whichfork)	/* data or attr fork */
 {
-	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
 	struct xfs_btree_cur	*cur;
 	ASSERT(whichfork != XFS_COW_FORK);
 
@@ -564,7 +564,7 @@ xfs_bmbt_init_cursor(
 	if (xfs_has_crc(mp))
 		cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
 
-	cur->bc_ino.forksize = XFS_IFORK_SIZE(ip, whichfork);
+	cur->bc_ino.forksize = xfs_inode_fork_size(ip, whichfork);
 	cur->bc_ino.ip = ip;
 	cur->bc_ino.allocated = 0;
 	cur->bc_ino.flags = 0;
@@ -664,7 +664,7 @@ xfs_bmbt_change_owner(
 
 	ASSERT(tp || buffer_list);
 	ASSERT(!(tp && buffer_list));
-	ASSERT(XFS_IFORK_PTR(ip, whichfork)->if_format == XFS_DINODE_FMT_BTREE);
+	ASSERT(xfs_ifork_ptr(ip, whichfork)->if_format == XFS_DINODE_FMT_BTREE);
 
 	cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork);
 	cur->bc_ino.flags |= XFS_BTCUR_BMBT_INVALID_OWNER;
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 2eecc49fc1b2..4c16c8c31fcb 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -91,10 +91,9 @@ xfs_btree_check_lblock_siblings(
 
 static inline xfs_failaddr_t
 xfs_btree_check_sblock_siblings(
-	struct xfs_mount	*mp,
+	struct xfs_perag	*pag,
 	struct xfs_btree_cur	*cur,
 	int			level,
-	xfs_agnumber_t		agno,
 	xfs_agblock_t		agbno,
 	__be32			dsibling)
 {
@@ -110,7 +109,7 @@ xfs_btree_check_sblock_siblings(
 		if (!xfs_btree_check_sptr(cur, sibling, level + 1))
 			return __this_address;
 	} else {
-		if (!xfs_verify_agbno(mp, agno, sibling))
+		if (!xfs_verify_agbno(pag, sibling))
 			return __this_address;
 	}
 	return NULL;
@@ -195,11 +194,11 @@ __xfs_btree_check_sblock(
 	struct xfs_buf		*bp)
 {
 	struct xfs_mount	*mp = cur->bc_mp;
+	struct xfs_perag	*pag = cur->bc_ag.pag;
 	xfs_btnum_t		btnum = cur->bc_btnum;
 	int			crc = xfs_has_crc(mp);
 	xfs_failaddr_t		fa;
 	xfs_agblock_t		agbno = NULLAGBLOCK;
-	xfs_agnumber_t		agno = NULLAGNUMBER;
 
 	if (crc) {
 		if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
@@ -217,16 +216,14 @@ __xfs_btree_check_sblock(
 	    cur->bc_ops->get_maxrecs(cur, level))
 		return __this_address;
 
-	if (bp) {
+	if (bp)
 		agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp));
-		agno = xfs_daddr_to_agno(mp, xfs_buf_daddr(bp));
-	}
 
-	fa = xfs_btree_check_sblock_siblings(mp, cur, level, agno, agbno,
+	fa = xfs_btree_check_sblock_siblings(pag, cur, level, agbno,
 			block->bb_u.s.bb_leftsib);
 	if (!fa)
-		fa = xfs_btree_check_sblock_siblings(mp, cur, level, agno,
-				 agbno, block->bb_u.s.bb_rightsib);
+		fa = xfs_btree_check_sblock_siblings(pag, cur, level, agbno,
+				block->bb_u.s.bb_rightsib);
 	return fa;
 }
 
@@ -288,7 +285,7 @@ xfs_btree_check_sptr(
 {
 	if (level <= 0)
 		return false;
-	return xfs_verify_agbno(cur->bc_mp, cur->bc_ag.pag->pag_agno, agbno);
+	return xfs_verify_agbno(cur->bc_ag.pag, agbno);
 }
 
 /*
@@ -725,7 +722,7 @@ xfs_btree_ifork_ptr(
 
 	if (cur->bc_flags & XFS_BTREE_STAGING)
 		return cur->bc_ino.ifake->if_fork;
-	return XFS_IFORK_PTR(cur->bc_ino.ip, cur->bc_ino.whichfork);
+	return xfs_ifork_ptr(cur->bc_ino.ip, cur->bc_ino.whichfork);
 }
 
 /*
@@ -3559,7 +3556,7 @@ xfs_btree_kill_iroot(
 {
 	int			whichfork = cur->bc_ino.whichfork;
 	struct xfs_inode	*ip = cur->bc_ino.ip;
-	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
 	struct xfs_btree_block	*block;
 	struct xfs_btree_block	*cblock;
 	union xfs_btree_key	*kp;
@@ -4595,7 +4592,6 @@ xfs_btree_sblock_verify(
 {
 	struct xfs_mount	*mp = bp->b_mount;
 	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
-	xfs_agnumber_t		agno;
 	xfs_agblock_t		agbno;
 	xfs_failaddr_t		fa;
 
@@ -4604,12 +4600,11 @@ xfs_btree_sblock_verify(
 		return __this_address;
 
 	/* sibling pointer verification */
-	agno = xfs_daddr_to_agno(mp, xfs_buf_daddr(bp));
 	agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp));
-	fa = xfs_btree_check_sblock_siblings(mp, NULL, -1, agno, agbno,
+	fa = xfs_btree_check_sblock_siblings(bp->b_pag, NULL, -1, agbno,
 			block->bb_u.s.bb_leftsib);
 	if (!fa)
-		fa = xfs_btree_check_sblock_siblings(mp, NULL, -1, agno, agbno,
+		fa = xfs_btree_check_sblock_siblings(bp->b_pag, NULL, -1, agbno,
 				block->bb_u.s.bb_rightsib);
 	return fa;
 }
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index 3cd51fa3837b..76eedc2756b3 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -193,7 +193,7 @@ xfs_dir_isempty(
 	ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
 	if (dp->i_disk_size == 0)	/* might happen during shutdown. */
 		return 1;
-	if (dp->i_disk_size > XFS_IFORK_DSIZE(dp))
+	if (dp->i_disk_size > xfs_inode_data_fork_size(dp))
 		return 0;
 	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
 	return !sfp->count;
diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c
index df0869bba275..00f960a703b2 100644
--- a/fs/xfs/libxfs/xfs_dir2_block.c
+++ b/fs/xfs/libxfs/xfs_dir2_block.c
@@ -842,7 +842,7 @@ xfs_dir2_block_removename(
 	 * See if the size as a shortform is good enough.
 	 */
 	size = xfs_dir2_block_sfsize(dp, hdr, &sfh);
-	if (size > XFS_IFORK_DSIZE(dp))
+	if (size > xfs_inode_data_fork_size(dp))
 		return 0;
 
 	/*
@@ -1055,7 +1055,7 @@ xfs_dir2_leaf_to_block(
 	 * Now see if the resulting block can be shrunken to shortform.
 	 */
 	size = xfs_dir2_block_sfsize(dp, hdr, &sfh);
-	if (size > XFS_IFORK_DSIZE(dp))
+	if (size > xfs_inode_data_fork_size(dp))
 		return 0;
 
 	return xfs_dir2_block_to_sf(args, dbp, size, &sfh);
@@ -1071,7 +1071,7 @@ xfs_dir2_sf_to_block(
 	struct xfs_trans	*tp = args->trans;
 	struct xfs_inode	*dp = args->dp;
 	struct xfs_mount	*mp = dp->i_mount;
-	struct xfs_ifork	*ifp = XFS_IFORK_PTR(dp, XFS_DATA_FORK);
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(dp, XFS_DATA_FORK);
 	struct xfs_da_geometry	*geo = args->geo;
 	xfs_dir2_db_t		blkno;		/* dir-relative block # (0) */
 	xfs_dir2_data_hdr_t	*hdr;		/* block header */
diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c
index 5a97a87eaa20..003812fd7d35 100644
--- a/fs/xfs/libxfs/xfs_dir2_sf.c
+++ b/fs/xfs/libxfs/xfs_dir2_sf.c
@@ -237,7 +237,7 @@ xfs_dir2_block_sfsize(
 		       (i8count ?			/* inumber */
 				count * XFS_INO64_SIZE :
 				count * XFS_INO32_SIZE);
-		if (size > XFS_IFORK_DSIZE(dp))
+		if (size > xfs_inode_data_fork_size(dp))
 			return size;		/* size value is a failure */
 	}
 	/*
@@ -406,7 +406,7 @@ xfs_dir2_sf_addname(
 	 * Won't fit as shortform any more (due to size),
 	 * or the pick routine says it won't (due to offset values).
 	 */
-	if (new_isize > XFS_IFORK_DSIZE(dp) ||
+	if (new_isize > xfs_inode_data_fork_size(dp) ||
 	    (pick =
 	     xfs_dir2_sf_addname_pick(args, objchange, &sfep, &offset)) == 0) {
 		/*
@@ -710,7 +710,7 @@ xfs_dir2_sf_verify(
 	struct xfs_inode		*ip)
 {
 	struct xfs_mount		*mp = ip->i_mount;
-	struct xfs_ifork		*ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+	struct xfs_ifork		*ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
 	struct xfs_dir2_sf_hdr		*sfp;
 	struct xfs_dir2_sf_entry	*sfep;
 	struct xfs_dir2_sf_entry	*next_sfep;
@@ -1031,7 +1031,7 @@ xfs_dir2_sf_replace_needblock(
 	newsize = dp->i_df.if_bytes + (sfp->count + 1) * XFS_INO64_DIFF;
 
 	return inum > XFS_DIR2_MAX_SHORT_INUM &&
-	       sfp->i8count == 0 && newsize > XFS_IFORK_DSIZE(dp);
+	       sfp->i8count == 0 && newsize > xfs_inode_data_fork_size(dp);
 }
 
 /*
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index afdfc8108c5f..b55bdfa9c8a8 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -704,7 +704,7 @@ struct xfs_agfl {
  * When the bigtime feature is enabled, ondisk inode timestamps become an
  * unsigned 64-bit nanoseconds counter.  This means that the bigtime inode
  * timestamp epoch is the start of the classic timestamp range, which is
- * Dec 31 20:45:52 UTC 1901.  Because the epochs are not the same, callers
+ * Dec 13 20:45:52 UTC 1901.  Because the epochs are not the same, callers
  * /must/ use the bigtime conversion functions when encoding and decoding raw
  * timestamps.
  */
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index bf2f4bc89193..6cdfd64bc56b 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -105,7 +105,6 @@ xfs_inobt_get_rec(
 	int				*stat)
 {
 	struct xfs_mount		*mp = cur->bc_mp;
-	xfs_agnumber_t			agno = cur->bc_ag.pag->pag_agno;
 	union xfs_btree_rec		*rec;
 	int				error;
 	uint64_t			realfree;
@@ -116,7 +115,7 @@ xfs_inobt_get_rec(
 
 	xfs_inobt_btrec_to_irec(mp, rec, irec);
 
-	if (!xfs_verify_agino(mp, agno, irec->ir_startino))
+	if (!xfs_verify_agino(cur->bc_ag.pag, irec->ir_startino))
 		goto out_bad_rec;
 	if (irec->ir_count < XFS_INODES_PER_HOLEMASK_BIT ||
 	    irec->ir_count > XFS_INODES_PER_CHUNK)
@@ -137,7 +136,8 @@ xfs_inobt_get_rec(
 out_bad_rec:
 	xfs_warn(mp,
 		"%s Inode BTree record corruption in AG %d detected!",
-		cur->bc_btnum == XFS_BTNUM_INO ? "Used" : "Free", agno);
+		cur->bc_btnum == XFS_BTNUM_INO ? "Used" : "Free",
+		cur->bc_ag.pag->pag_agno);
 	xfs_warn(mp,
 "start inode 0x%x, count 0x%x, free 0x%x freemask 0x%llx, holemask 0x%x",
 		irec->ir_startino, irec->ir_count, irec->ir_freecount,
@@ -1610,7 +1610,7 @@ xfs_dialloc_good_ag(
 		return false;
 
 	if (!pag->pagi_init) {
-		error = xfs_ialloc_pagi_init(mp, tp, pag->pag_agno);
+		error = xfs_ialloc_read_agi(pag, tp, NULL);
 		if (error)
 			return false;
 	}
@@ -1621,7 +1621,7 @@ xfs_dialloc_good_ag(
 		return false;
 
 	if (!pag->pagf_init) {
-		error = xfs_alloc_pagf_init(mp, tp, pag->pag_agno, flags);
+		error = xfs_alloc_read_agf(pag, tp, flags, NULL);
 		if (error)
 			return false;
 	}
@@ -1679,7 +1679,7 @@ xfs_dialloc_try_ag(
 	 * Then read in the AGI buffer and recheck with the AGI buffer
 	 * lock held.
 	 */
-	error = xfs_ialloc_read_agi(pag->pag_mount, *tpp, pag->pag_agno, &agbp);
+	error = xfs_ialloc_read_agi(pag, *tpp, &agbp);
 	if (error)
 		return error;
 
@@ -2169,7 +2169,7 @@ xfs_difree(
 	/*
 	 * Get the allocation group header.
 	 */
-	error = xfs_ialloc_read_agi(mp, tp, pag->pag_agno, &agbp);
+	error = xfs_ialloc_read_agi(pag, tp, &agbp);
 	if (error) {
 		xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.",
 			__func__, error);
@@ -2215,7 +2215,7 @@ xfs_imap_lookup(
 	int			error;
 	int			i;
 
-	error = xfs_ialloc_read_agi(mp, tp, pag->pag_agno, &agbp);
+	error = xfs_ialloc_read_agi(pag, tp, &agbp);
 	if (error) {
 		xfs_alert(mp,
 			"%s: xfs_ialloc_read_agi() returned error %d, agno %d",
@@ -2571,47 +2571,48 @@ const struct xfs_buf_ops xfs_agi_buf_ops = {
  */
 int
 xfs_read_agi(
-	struct xfs_mount	*mp,	/* file system mount structure */
-	struct xfs_trans	*tp,	/* transaction pointer */
-	xfs_agnumber_t		agno,	/* allocation group number */
-	struct xfs_buf		**bpp)	/* allocation group hdr buf */
+	struct xfs_perag	*pag,
+	struct xfs_trans	*tp,
+	struct xfs_buf		**agibpp)
 {
+	struct xfs_mount	*mp = pag->pag_mount;
 	int			error;
 
-	trace_xfs_read_agi(mp, agno);
+	trace_xfs_read_agi(pag->pag_mount, pag->pag_agno);
 
-	ASSERT(agno != NULLAGNUMBER);
 	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
-			XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
-			XFS_FSS_TO_BB(mp, 1), 0, bpp, &xfs_agi_buf_ops);
+			XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGI_DADDR(mp)),
+			XFS_FSS_TO_BB(mp, 1), 0, agibpp, &xfs_agi_buf_ops);
 	if (error)
 		return error;
 	if (tp)
-		xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_AGI_BUF);
+		xfs_trans_buf_set_type(tp, *agibpp, XFS_BLFT_AGI_BUF);
 
-	xfs_buf_set_ref(*bpp, XFS_AGI_REF);
+	xfs_buf_set_ref(*agibpp, XFS_AGI_REF);
 	return 0;
 }
 
+/*
+ * Read in the agi and initialise the per-ag data. If the caller supplies a
+ * @agibpp, return the locked AGI buffer to them, otherwise release it.
+ */
 int
 xfs_ialloc_read_agi(
-	struct xfs_mount	*mp,	/* file system mount structure */
-	struct xfs_trans	*tp,	/* transaction pointer */
-	xfs_agnumber_t		agno,	/* allocation group number */
-	struct xfs_buf		**bpp)	/* allocation group hdr buf */
+	struct xfs_perag	*pag,
+	struct xfs_trans	*tp,
+	struct xfs_buf		**agibpp)
 {
-	struct xfs_agi		*agi;	/* allocation group header */
-	struct xfs_perag	*pag;	/* per allocation group data */
+	struct xfs_buf		*agibp;
+	struct xfs_agi		*agi;
 	int			error;
 
-	trace_xfs_ialloc_read_agi(mp, agno);
+	trace_xfs_ialloc_read_agi(pag->pag_mount, pag->pag_agno);
 
-	error = xfs_read_agi(mp, tp, agno, bpp);
+	error = xfs_read_agi(pag, tp, &agibp);
 	if (error)
 		return error;
 
-	agi = (*bpp)->b_addr;
-	pag = (*bpp)->b_pag;
+	agi = agibp->b_addr;
 	if (!pag->pagi_init) {
 		pag->pagi_freecount = be32_to_cpu(agi->agi_freecount);
 		pag->pagi_count = be32_to_cpu(agi->agi_count);
@@ -2623,27 +2624,11 @@ xfs_ialloc_read_agi(
 	 * we are in the middle of a forced shutdown.
 	 */
 	ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) ||
-		xfs_is_shutdown(mp));
-	return 0;
-}
-
-/*
- * Read in the agi to initialise the per-ag data in the mount structure
- */
-int
-xfs_ialloc_pagi_init(
-	xfs_mount_t	*mp,		/* file system mount structure */
-	xfs_trans_t	*tp,		/* transaction pointer */
-	xfs_agnumber_t	agno)		/* allocation group number */
-{
-	struct xfs_buf	*bp = NULL;
-	int		error;
-
-	error = xfs_ialloc_read_agi(mp, tp, agno, &bp);
-	if (error)
-		return error;
-	if (bp)
-		xfs_trans_brelse(tp, bp);
+		xfs_is_shutdown(pag->pag_mount));
+	if (agibpp)
+		*agibpp = agibp;
+	else
+		xfs_trans_brelse(tp, agibp);
 	return 0;
 }
 
@@ -2912,8 +2897,7 @@ xfs_ialloc_calc_rootino(
 	 * allocation group, or very odd geometries created by old mkfs
 	 * versions on very small filesystems.
 	 */
-	if (mp->m_sb.sb_logstart &&
-	    XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == 0)
+	if (xfs_ag_contains_log(mp, 0))
 		 first_bno += mp->m_sb.sb_logblocks;
 
 	/*
diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h
index a7705b6a1fd3..9bbbca6ac4ed 100644
--- a/fs/xfs/libxfs/xfs_ialloc.h
+++ b/fs/xfs/libxfs/xfs_ialloc.h
@@ -62,25 +62,10 @@ xfs_ialloc_log_agi(
 	struct xfs_buf	*bp,		/* allocation group header buffer */
 	uint32_t	fields);	/* bitmask of fields to log */
 
-/*
- * Read in the allocation group header (inode allocation section)
- */
-int					/* error */
-xfs_ialloc_read_agi(
-	struct xfs_mount *mp,		/* file system mount structure */
-	struct xfs_trans *tp,		/* transaction pointer */
-	xfs_agnumber_t	agno,		/* allocation group number */
-	struct xfs_buf	**bpp);		/* allocation group hdr buf */
-
-/*
- * Read in the allocation group header to initialise the per-ag data
- * in the mount structure
- */
-int
-xfs_ialloc_pagi_init(
-	struct xfs_mount *mp,		/* file system mount structure */
-	struct xfs_trans *tp,		/* transaction pointer */
-        xfs_agnumber_t  agno);		/* allocation group number */
+int xfs_read_agi(struct xfs_perag *pag, struct xfs_trans *tp,
+		struct xfs_buf **agibpp);
+int xfs_ialloc_read_agi(struct xfs_perag *pag, struct xfs_trans *tp,
+		struct xfs_buf **agibpp);
 
 /*
  * Lookup a record by ino in the btree given by cur.
@@ -102,8 +87,6 @@ int xfs_ialloc_inode_init(struct xfs_mount *mp, struct xfs_trans *tp,
 			  xfs_agnumber_t agno, xfs_agblock_t agbno,
 			  xfs_agblock_t length, unsigned int gen);
 
-int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp,
-		xfs_agnumber_t agno, struct xfs_buf **bpp);
 
 union xfs_btree_rec;
 void xfs_inobt_btrec_to_irec(struct xfs_mount *mp,
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index b2ad2fdc40f5..8c83e265770c 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -683,10 +683,10 @@ xfs_inobt_rec_check_count(
 
 static xfs_extlen_t
 xfs_inobt_max_size(
-	struct xfs_mount	*mp,
-	xfs_agnumber_t		agno)
+	struct xfs_perag	*pag)
 {
-	xfs_agblock_t		agblocks = xfs_ag_block_count(mp, agno);
+	struct xfs_mount	*mp = pag->pag_mount;
+	xfs_agblock_t		agblocks = pag->block_count;
 
 	/* Bail out if we're uninitialized, which can happen in mkfs. */
 	if (M_IGEO(mp)->inobt_mxr[0] == 0)
@@ -697,8 +697,7 @@ xfs_inobt_max_size(
 	 * never be available for the kinds of things that would require btree
 	 * expansion.  We therefore can pretend the space isn't there.
 	 */
-	if (mp->m_sb.sb_logstart &&
-	    XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == agno)
+	if (xfs_ag_contains_log(mp, pag->pag_agno))
 		agblocks -= mp->m_sb.sb_logblocks;
 
 	return xfs_btree_calc_size(M_IGEO(mp)->inobt_mnr,
@@ -722,7 +721,7 @@ xfs_inobt_cur(
 	ASSERT(*agi_bpp == NULL);
 	ASSERT(*curpp == NULL);
 
-	error = xfs_ialloc_read_agi(mp, tp, pag->pag_agno, agi_bpp);
+	error = xfs_ialloc_read_agi(pag, tp, agi_bpp);
 	if (error)
 		return error;
 
@@ -757,16 +756,15 @@ xfs_inobt_count_blocks(
 /* Read finobt block count from AGI header. */
 static int
 xfs_finobt_read_blocks(
-	struct xfs_mount	*mp,
-	struct xfs_trans	*tp,
 	struct xfs_perag	*pag,
+	struct xfs_trans	*tp,
 	xfs_extlen_t		*tree_blocks)
 {
 	struct xfs_buf		*agbp;
 	struct xfs_agi		*agi;
 	int			error;
 
-	error = xfs_ialloc_read_agi(mp, tp, pag->pag_agno, &agbp);
+	error = xfs_ialloc_read_agi(pag, tp, &agbp);
 	if (error)
 		return error;
 
@@ -794,14 +792,14 @@ xfs_finobt_calc_reserves(
 		return 0;
 
 	if (xfs_has_inobtcounts(mp))
-		error = xfs_finobt_read_blocks(mp, tp, pag, &tree_len);
+		error = xfs_finobt_read_blocks(pag, tp, &tree_len);
 	else
 		error = xfs_inobt_count_blocks(mp, tp, pag, XFS_BTNUM_FINO,
 				&tree_len);
 	if (error)
 		return error;
 
-	*ask += xfs_inobt_max_size(mp, pag->pag_agno);
+	*ask += xfs_inobt_max_size(pag);
 	*used += tree_len;
 	return 0;
 }
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 3b1b63f9d886..758aacd8166b 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -10,6 +10,7 @@
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_mount.h"
+#include "xfs_ag.h"
 #include "xfs_inode.h"
 #include "xfs_errortag.h"
 #include "xfs_error.h"
@@ -41,14 +42,12 @@ xfs_inode_buf_verify(
 	bool		readahead)
 {
 	struct xfs_mount *mp = bp->b_mount;
-	xfs_agnumber_t	agno;
 	int		i;
 	int		ni;
 
 	/*
 	 * Validate the magic number and version of every inode in the buffer
 	 */
-	agno = xfs_daddr_to_agno(mp, xfs_buf_daddr(bp));
 	ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock;
 	for (i = 0; i < ni; i++) {
 		struct xfs_dinode	*dip;
@@ -59,7 +58,7 @@ xfs_inode_buf_verify(
 		unlinked_ino = be32_to_cpu(dip->di_next_unlinked);
 		di_ok = xfs_verify_magic16(bp, dip->di_magic) &&
 			xfs_dinode_good_version(mp, dip->di_version) &&
-			xfs_verify_agino_or_null(mp, agno, unlinked_ino);
+			xfs_verify_agino_or_null(bp->b_pag, unlinked_ino);
 		if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
 						XFS_ERRTAG_ITOBP_INOTOBP))) {
 			if (readahead) {
@@ -178,7 +177,6 @@ xfs_inode_from_disk(
 	xfs_failaddr_t		fa;
 
 	ASSERT(ip->i_cowfp == NULL);
-	ASSERT(ip->i_afp == NULL);
 
 	fa = xfs_dinode_verify(ip->i_mount, ip->i_ino, from);
 	if (fa) {
@@ -230,7 +228,8 @@ xfs_inode_from_disk(
 	ip->i_nblocks = be64_to_cpu(from->di_nblocks);
 	ip->i_extsize = be32_to_cpu(from->di_extsize);
 	ip->i_forkoff = from->di_forkoff;
-	ip->i_diflags	= be16_to_cpu(from->di_flags);
+	ip->i_diflags = be16_to_cpu(from->di_flags);
+	ip->i_next_unlinked = be32_to_cpu(from->di_next_unlinked);
 
 	if (from->di_dmevmask || from->di_dmstate)
 		xfs_iflags_set(ip, XFS_IPRESERVE_DM_FIELDS);
@@ -286,7 +285,7 @@ xfs_inode_to_disk_iext_counters(
 {
 	if (xfs_inode_has_large_extent_counts(ip)) {
 		to->di_big_nextents = cpu_to_be64(xfs_ifork_nextents(&ip->i_df));
-		to->di_big_anextents = cpu_to_be32(xfs_ifork_nextents(ip->i_afp));
+		to->di_big_anextents = cpu_to_be32(xfs_ifork_nextents(&ip->i_af));
 		/*
 		 * We might be upgrading the inode to use larger extent counters
 		 * than was previously used. Hence zero the unused field.
@@ -294,7 +293,7 @@ xfs_inode_to_disk_iext_counters(
 		to->di_nrext64_pad = cpu_to_be16(0);
 	} else {
 		to->di_nextents = cpu_to_be32(xfs_ifork_nextents(&ip->i_df));
-		to->di_anextents = cpu_to_be16(xfs_ifork_nextents(ip->i_afp));
+		to->di_anextents = cpu_to_be16(xfs_ifork_nextents(&ip->i_af));
 	}
 }
 
@@ -326,7 +325,7 @@ xfs_inode_to_disk(
 	to->di_nblocks = cpu_to_be64(ip->i_nblocks);
 	to->di_extsize = cpu_to_be32(ip->i_extsize);
 	to->di_forkoff = ip->i_forkoff;
-	to->di_aformat = xfs_ifork_format(ip->i_afp);
+	to->di_aformat = xfs_ifork_format(&ip->i_af);
 	to->di_flags = cpu_to_be16(ip->i_diflags);
 
 	if (xfs_has_v3inodes(ip->i_mount)) {
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 1a4cdf550f6d..9327a4f39206 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -35,7 +35,7 @@ xfs_init_local_fork(
 	const void		*data,
 	int64_t			size)
 {
-	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
 	int			mem_size = size;
 	bool			zero_terminate;
 
@@ -102,7 +102,7 @@ xfs_iformat_extents(
 	int			whichfork)
 {
 	struct xfs_mount	*mp = ip->i_mount;
-	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
 	int			state = xfs_bmap_fork_to_state(whichfork);
 	xfs_extnum_t		nex = xfs_dfork_nextents(dip, whichfork);
 	int			size = nex * sizeof(xfs_bmbt_rec_t);
@@ -173,7 +173,7 @@ xfs_iformat_btree(
 	int			size;
 	int			level;
 
-	ifp = XFS_IFORK_PTR(ip, whichfork);
+	ifp = xfs_ifork_ptr(ip, whichfork);
 	dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork);
 	size = XFS_BMAP_BROOT_SPACE(mp, dfp);
 	nrecs = be16_to_cpu(dfp->bb_numrecs);
@@ -276,17 +276,23 @@ xfs_dfork_attr_shortform_size(
 	return be16_to_cpu(atp->hdr.totsize);
 }
 
-struct xfs_ifork *
-xfs_ifork_alloc(
+void
+xfs_ifork_init_attr(
+	struct xfs_inode	*ip,
 	enum xfs_dinode_fmt	format,
 	xfs_extnum_t		nextents)
 {
-	struct xfs_ifork	*ifp;
+	ip->i_af.if_format = format;
+	ip->i_af.if_nextents = nextents;
+}
 
-	ifp = kmem_cache_zalloc(xfs_ifork_cache, GFP_NOFS | __GFP_NOFAIL);
-	ifp->if_format = format;
-	ifp->if_nextents = nextents;
-	return ifp;
+void
+xfs_ifork_zap_attr(
+	struct xfs_inode	*ip)
+{
+	xfs_idestroy_fork(&ip->i_af);
+	memset(&ip->i_af, 0, sizeof(struct xfs_ifork));
+	ip->i_af.if_format = XFS_DINODE_FMT_EXTENTS;
 }
 
 int
@@ -301,9 +307,9 @@ xfs_iformat_attr_fork(
 	 * Initialize the extent count early, as the per-format routines may
 	 * depend on it.
 	 */
-	ip->i_afp = xfs_ifork_alloc(dip->di_aformat, naextents);
+	xfs_ifork_init_attr(ip, dip->di_aformat, naextents);
 
-	switch (ip->i_afp->if_format) {
+	switch (ip->i_af.if_format) {
 	case XFS_DINODE_FMT_LOCAL:
 		error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK,
 				xfs_dfork_attr_shortform_size(dip));
@@ -323,10 +329,8 @@ xfs_iformat_attr_fork(
 		break;
 	}
 
-	if (error) {
-		kmem_cache_free(xfs_ifork_cache, ip->i_afp);
-		ip->i_afp = NULL;
-	}
+	if (error)
+		xfs_ifork_zap_attr(ip);
 	return error;
 }
 
@@ -370,7 +374,7 @@ xfs_iroot_realloc(
 		return;
 	}
 
-	ifp = XFS_IFORK_PTR(ip, whichfork);
+	ifp = xfs_ifork_ptr(ip, whichfork);
 	if (rec_diff > 0) {
 		/*
 		 * If there wasn't any memory allocated before, just
@@ -400,7 +404,7 @@ xfs_iroot_realloc(
 						     (int)new_size);
 		ifp->if_broot_bytes = (int)new_size;
 		ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
-			XFS_IFORK_SIZE(ip, whichfork));
+			xfs_inode_fork_size(ip, whichfork));
 		memmove(np, op, cur_max * (uint)sizeof(xfs_fsblock_t));
 		return;
 	}
@@ -454,7 +458,7 @@ xfs_iroot_realloc(
 	ifp->if_broot_bytes = (int)new_size;
 	if (ifp->if_broot)
 		ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
-			XFS_IFORK_SIZE(ip, whichfork));
+			xfs_inode_fork_size(ip, whichfork));
 	return;
 }
 
@@ -480,11 +484,11 @@ xfs_idata_realloc(
 	int64_t			byte_diff,
 	int			whichfork)
 {
-	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
 	int64_t			new_size = ifp->if_bytes + byte_diff;
 
 	ASSERT(new_size >= 0);
-	ASSERT(new_size <= XFS_IFORK_SIZE(ip, whichfork));
+	ASSERT(new_size <= xfs_inode_fork_size(ip, whichfork));
 
 	if (byte_diff == 0)
 		return;
@@ -539,7 +543,7 @@ xfs_iextents_copy(
 	int			whichfork)
 {
 	int			state = xfs_bmap_fork_to_state(whichfork);
-	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
 	struct xfs_iext_cursor	icur;
 	struct xfs_bmbt_irec	rec;
 	int64_t			copied = 0;
@@ -591,7 +595,7 @@ xfs_iflush_fork(
 
 	if (!iip)
 		return;
-	ifp = XFS_IFORK_PTR(ip, whichfork);
+	ifp = xfs_ifork_ptr(ip, whichfork);
 	/*
 	 * This can happen if we gave up in iformat in an error path,
 	 * for the attribute fork.
@@ -607,7 +611,7 @@ xfs_iflush_fork(
 		if ((iip->ili_fields & dataflag[whichfork]) &&
 		    (ifp->if_bytes > 0)) {
 			ASSERT(ifp->if_u1.if_data != NULL);
-			ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
+			ASSERT(ifp->if_bytes <= xfs_inode_fork_size(ip, whichfork));
 			memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes);
 		}
 		break;
@@ -626,7 +630,7 @@ xfs_iflush_fork(
 		    (ifp->if_broot_bytes > 0)) {
 			ASSERT(ifp->if_broot != NULL);
 			ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
-			        XFS_IFORK_SIZE(ip, whichfork));
+			        xfs_inode_fork_size(ip, whichfork));
 			xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes,
 				(xfs_bmdr_block_t *)cp,
 				XFS_DFORK_SIZE(dip, mp, whichfork));
@@ -656,7 +660,7 @@ xfs_iext_state_to_fork(
 	if (state & BMAP_COWFORK)
 		return ip->i_cowfp;
 	else if (state & BMAP_ATTRFORK)
-		return ip->i_afp;
+		return &ip->i_af;
 	return &ip->i_df;
 }
 
@@ -707,18 +711,17 @@ int
 xfs_ifork_verify_local_attr(
 	struct xfs_inode	*ip)
 {
-	struct xfs_ifork	*ifp = ip->i_afp;
+	struct xfs_ifork	*ifp = &ip->i_af;
 	xfs_failaddr_t		fa;
 
-	if (!ifp)
+	if (!xfs_inode_has_attr_fork(ip))
 		fa = __this_address;
 	else
 		fa = xfs_attr_shortform_verify(ip);
 
 	if (fa) {
 		xfs_inode_verifier_error(ip, -EFSCORRUPTED, "attr fork",
-				ifp ? ifp->if_u1.if_data : NULL,
-				ifp ? ifp->if_bytes : 0, fa);
+				ifp->if_u1.if_data, ifp->if_bytes, fa);
 		return -EFSCORRUPTED;
 	}
 
@@ -731,7 +734,7 @@ xfs_iext_count_may_overflow(
 	int			whichfork,
 	int			nr_to_add)
 {
-	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
 	uint64_t		max_exts;
 	uint64_t		nr_exts;
 
diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h
index 4f68c1f20beb..d3943d6ad0b9 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.h
+++ b/fs/xfs/libxfs/xfs_inode_fork.h
@@ -77,28 +77,8 @@ struct xfs_ifork {
 /*
  * Fork handling.
  */
-
-#define XFS_IFORK_Q(ip)			((ip)->i_forkoff != 0)
-#define XFS_IFORK_BOFF(ip)		((int)((ip)->i_forkoff << 3))
-
-#define XFS_IFORK_PTR(ip,w)		\
-	((w) == XFS_DATA_FORK ? \
-		&(ip)->i_df : \
-		((w) == XFS_ATTR_FORK ? \
-			(ip)->i_afp : \
-			(ip)->i_cowfp))
-#define XFS_IFORK_DSIZE(ip) \
-	(XFS_IFORK_Q(ip) ? XFS_IFORK_BOFF(ip) : XFS_LITINO((ip)->i_mount))
-#define XFS_IFORK_ASIZE(ip) \
-	(XFS_IFORK_Q(ip) ? XFS_LITINO((ip)->i_mount) - XFS_IFORK_BOFF(ip) : 0)
-#define XFS_IFORK_SIZE(ip,w) \
-	((w) == XFS_DATA_FORK ? \
-		XFS_IFORK_DSIZE(ip) : \
-		((w) == XFS_ATTR_FORK ? \
-			XFS_IFORK_ASIZE(ip) : \
-			0))
 #define XFS_IFORK_MAXEXT(ip, w) \
-	(XFS_IFORK_SIZE(ip, w) / sizeof(xfs_bmbt_rec_t))
+	(xfs_inode_fork_size(ip, w) / sizeof(xfs_bmbt_rec_t))
 
 static inline bool xfs_ifork_has_extents(struct xfs_ifork *ifp)
 {
@@ -179,8 +159,9 @@ xfs_dfork_nextents(
 	return 0;
 }
 
-struct xfs_ifork *xfs_ifork_alloc(enum xfs_dinode_fmt format,
-				xfs_extnum_t nextents);
+void xfs_ifork_zap_attr(struct xfs_inode *ip);
+void xfs_ifork_init_attr(struct xfs_inode *ip, enum xfs_dinode_fmt format,
+		xfs_extnum_t nextents);
 struct xfs_ifork *xfs_iext_state_to_fork(struct xfs_inode *ip, int state);
 
 int		xfs_iformat_data_fork(struct xfs_inode *, struct xfs_dinode *);
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index 97e9e6020596..64b910caafaa 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -111,7 +111,7 @@ xfs_refcount_get_rec(
 	int				*stat)
 {
 	struct xfs_mount		*mp = cur->bc_mp;
-	xfs_agnumber_t			agno = cur->bc_ag.pag->pag_agno;
+	struct xfs_perag		*pag = cur->bc_ag.pag;
 	union xfs_btree_rec		*rec;
 	int				error;
 	xfs_agblock_t			realstart;
@@ -121,8 +121,6 @@ xfs_refcount_get_rec(
 		return error;
 
 	xfs_refcount_btrec_to_irec(rec, irec);
-
-	agno = cur->bc_ag.pag->pag_agno;
 	if (irec->rc_blockcount == 0 || irec->rc_blockcount > MAXREFCEXTLEN)
 		goto out_bad_rec;
 
@@ -137,22 +135,23 @@ xfs_refcount_get_rec(
 	}
 
 	/* check for valid extent range, including overflow */
-	if (!xfs_verify_agbno(mp, agno, realstart))
+	if (!xfs_verify_agbno(pag, realstart))
 		goto out_bad_rec;
 	if (realstart > realstart + irec->rc_blockcount)
 		goto out_bad_rec;
-	if (!xfs_verify_agbno(mp, agno, realstart + irec->rc_blockcount - 1))
+	if (!xfs_verify_agbno(pag, realstart + irec->rc_blockcount - 1))
 		goto out_bad_rec;
 
 	if (irec->rc_refcount == 0 || irec->rc_refcount > MAXREFCOUNT)
 		goto out_bad_rec;
 
-	trace_xfs_refcount_get(cur->bc_mp, cur->bc_ag.pag->pag_agno, irec);
+	trace_xfs_refcount_get(cur->bc_mp, pag->pag_agno, irec);
 	return 0;
 
 out_bad_rec:
 	xfs_warn(mp,
-		"Refcount BTree record corruption in AG %d detected!", agno);
+		"Refcount BTree record corruption in AG %d detected!",
+		pag->pag_agno);
 	xfs_warn(mp,
 		"Start block 0x%x, block count 0x%x, references 0x%x",
 		irec->rc_startblock, irec->rc_blockcount, irec->rc_refcount);
@@ -1177,8 +1176,8 @@ xfs_refcount_finish_one(
 		*pcur = NULL;
 	}
 	if (rcur == NULL) {
-		error = xfs_alloc_read_agf(tp->t_mountp, tp, pag->pag_agno,
-				XFS_ALLOC_FLAG_FREEING, &agbp);
+		error = xfs_alloc_read_agf(pag, tp, XFS_ALLOC_FLAG_FREEING,
+				&agbp);
 		if (error)
 			goto out_drop;
 
@@ -1710,7 +1709,7 @@ xfs_refcount_recover_cow_leftovers(
 	if (error)
 		return error;
 
-	error = xfs_alloc_read_agf(mp, tp, pag->pag_agno, 0, &agbp);
+	error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
 	if (error)
 		goto out_trans;
 	cur = xfs_refcountbt_init_cursor(mp, tp, agbp, pag);
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c
index d14c1720b0fb..316c1ec0c3c2 100644
--- a/fs/xfs/libxfs/xfs_refcount_btree.c
+++ b/fs/xfs/libxfs/xfs_refcount_btree.c
@@ -493,7 +493,7 @@ xfs_refcountbt_calc_reserves(
 	if (!xfs_has_reflink(mp))
 		return 0;
 
-	error = xfs_alloc_read_agf(mp, tp, pag->pag_agno, 0, &agbp);
+	error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
 	if (error)
 		return error;
 
@@ -507,8 +507,7 @@ xfs_refcountbt_calc_reserves(
 	 * never be available for the kinds of things that would require btree
 	 * expansion.  We therefore can pretend the space isn't there.
 	 */
-	if (mp->m_sb.sb_logstart &&
-	    XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == pag->pag_agno)
+	if (xfs_ag_contains_log(mp, pag->pag_agno))
 		agblocks -= mp->m_sb.sb_logblocks;
 
 	*ask += xfs_refcountbt_max_size(mp, agblocks);
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index 2845019d31da..094dfc897ebc 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -215,7 +215,7 @@ xfs_rmap_get_rec(
 	int			*stat)
 {
 	struct xfs_mount	*mp = cur->bc_mp;
-	xfs_agnumber_t		agno = cur->bc_ag.pag->pag_agno;
+	struct xfs_perag	*pag = cur->bc_ag.pag;
 	union xfs_btree_rec	*rec;
 	int			error;
 
@@ -235,12 +235,12 @@ xfs_rmap_get_rec(
 			goto out_bad_rec;
 	} else {
 		/* check for valid extent range, including overflow */
-		if (!xfs_verify_agbno(mp, agno, irec->rm_startblock))
+		if (!xfs_verify_agbno(pag, irec->rm_startblock))
 			goto out_bad_rec;
 		if (irec->rm_startblock >
 				irec->rm_startblock + irec->rm_blockcount)
 			goto out_bad_rec;
-		if (!xfs_verify_agbno(mp, agno,
+		if (!xfs_verify_agbno(pag,
 				irec->rm_startblock + irec->rm_blockcount - 1))
 			goto out_bad_rec;
 	}
@@ -254,7 +254,7 @@ xfs_rmap_get_rec(
 out_bad_rec:
 	xfs_warn(mp,
 		"Reverse Mapping BTree record corruption in AG %d detected!",
-		agno);
+		pag->pag_agno);
 	xfs_warn(mp,
 		"Owner 0x%llx, flags 0x%x, start block 0x%x block count 0x%x",
 		irec->rm_owner, irec->rm_flags, irec->rm_startblock,
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c
index 69e104d0277f..7f83f62e51e0 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.c
+++ b/fs/xfs/libxfs/xfs_rmap_btree.c
@@ -90,7 +90,7 @@ xfs_rmapbt_alloc_block(
 	xfs_agblock_t		bno;
 
 	/* Allocate the new block from the freelist. If we can't, give up.  */
-	error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_ag.agbp,
+	error = xfs_alloc_get_freelist(pag, cur->bc_tp, cur->bc_ag.agbp,
 				       &bno, 1);
 	if (error)
 		return error;
@@ -129,7 +129,7 @@ xfs_rmapbt_free_block(
 			bno, 1);
 	be32_add_cpu(&agf->agf_rmap_blocks, -1);
 	xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS);
-	error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1);
+	error = xfs_alloc_put_freelist(pag, cur->bc_tp, agbp, NULL, bno, 1);
 	if (error)
 		return error;
 
@@ -652,7 +652,7 @@ xfs_rmapbt_calc_reserves(
 	if (!xfs_has_rmapbt(mp))
 		return 0;
 
-	error = xfs_alloc_read_agf(mp, tp, pag->pag_agno, 0, &agbp);
+	error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
 	if (error)
 		return error;
 
@@ -666,8 +666,7 @@ xfs_rmapbt_calc_reserves(
 	 * never be available for the kinds of things that would require btree
 	 * expansion.  We therefore can pretend the space isn't there.
 	 */
-	if (mp->m_sb.sb_logstart &&
-	    XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == pag->pag_agno)
+	if (xfs_ag_contains_log(mp, pag->pag_agno))
 		agblocks -= mp->m_sb.sb_logblocks;
 
 	/* Reserve 1% of the AG or enough for 1 block per record. */
diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c
index 8b9bd178a487..bdc777b9ec4a 100644
--- a/fs/xfs/libxfs/xfs_symlink_remote.c
+++ b/fs/xfs/libxfs/xfs_symlink_remote.c
@@ -204,7 +204,7 @@ xfs_failaddr_t
 xfs_symlink_shortform_verify(
 	struct xfs_inode	*ip)
 {
-	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
 	char			*sfp = (char *)ifp->if_u1.if_data;
 	int			size = ifp->if_bytes;
 	char			*endp = sfp + size;
diff --git a/fs/xfs/libxfs/xfs_types.c b/fs/xfs/libxfs/xfs_types.c
index e810d23f2d97..5c2765934732 100644
--- a/fs/xfs/libxfs/xfs_types.c
+++ b/fs/xfs/libxfs/xfs_types.c
@@ -13,25 +13,13 @@
 #include "xfs_mount.h"
 #include "xfs_ag.h"
 
-/* Find the size of the AG, in blocks. */
-inline xfs_agblock_t
-xfs_ag_block_count(
-	struct xfs_mount	*mp,
-	xfs_agnumber_t		agno)
-{
-	ASSERT(agno < mp->m_sb.sb_agcount);
-
-	if (agno < mp->m_sb.sb_agcount - 1)
-		return mp->m_sb.sb_agblocks;
-	return mp->m_sb.sb_dblocks - (agno * mp->m_sb.sb_agblocks);
-}
 
 /*
  * Verify that an AG block number pointer neither points outside the AG
  * nor points at static metadata.
  */
-inline bool
-xfs_verify_agbno(
+static inline bool
+xfs_verify_agno_agbno(
 	struct xfs_mount	*mp,
 	xfs_agnumber_t		agno,
 	xfs_agblock_t		agbno)
@@ -59,7 +47,7 @@ xfs_verify_fsbno(
 
 	if (agno >= mp->m_sb.sb_agcount)
 		return false;
-	return xfs_verify_agbno(mp, agno, XFS_FSB_TO_AGBNO(mp, fsbno));
+	return xfs_verify_agno_agbno(mp, agno, XFS_FSB_TO_AGBNO(mp, fsbno));
 }
 
 /*
@@ -85,40 +73,12 @@ xfs_verify_fsbext(
 		XFS_FSB_TO_AGNO(mp, fsbno + len - 1);
 }
 
-/* Calculate the first and last possible inode number in an AG. */
-inline void
-xfs_agino_range(
-	struct xfs_mount	*mp,
-	xfs_agnumber_t		agno,
-	xfs_agino_t		*first,
-	xfs_agino_t		*last)
-{
-	xfs_agblock_t		bno;
-	xfs_agblock_t		eoag;
-
-	eoag = xfs_ag_block_count(mp, agno);
-
-	/*
-	 * Calculate the first inode, which will be in the first
-	 * cluster-aligned block after the AGFL.
-	 */
-	bno = round_up(XFS_AGFL_BLOCK(mp) + 1, M_IGEO(mp)->cluster_align);
-	*first = XFS_AGB_TO_AGINO(mp, bno);
-
-	/*
-	 * Calculate the last inode, which will be at the end of the
-	 * last (aligned) cluster that can be allocated in the AG.
-	 */
-	bno = round_down(eoag, M_IGEO(mp)->cluster_align);
-	*last = XFS_AGB_TO_AGINO(mp, bno) - 1;
-}
-
 /*
  * Verify that an AG inode number pointer neither points outside the AG
  * nor points at static metadata.
  */
-inline bool
-xfs_verify_agino(
+static inline bool
+xfs_verify_agno_agino(
 	struct xfs_mount	*mp,
 	xfs_agnumber_t		agno,
 	xfs_agino_t		agino)
@@ -130,19 +90,6 @@ xfs_verify_agino(
 	return agino >= first && agino <= last;
 }
 
-/*
- * Verify that an AG inode number pointer neither points outside the AG
- * nor points at static metadata, or is NULLAGINO.
- */
-bool
-xfs_verify_agino_or_null(
-	struct xfs_mount	*mp,
-	xfs_agnumber_t		agno,
-	xfs_agino_t		agino)
-{
-	return agino == NULLAGINO || xfs_verify_agino(mp, agno, agino);
-}
-
 /*
  * Verify that an FS inode number pointer neither points outside the
  * filesystem nor points at static AG metadata.
@@ -159,7 +106,7 @@ xfs_verify_ino(
 		return false;
 	if (XFS_AGINO_TO_INO(mp, agno, agino) != ino)
 		return false;
-	return xfs_verify_agino(mp, agno, agino);
+	return xfs_verify_agno_agino(mp, agno, agino);
 }
 
 /* Is this an internal inode number? */
@@ -229,12 +176,8 @@ xfs_icount_range(
 	/* root, rtbitmap, rtsum all live in the first chunk */
 	*min = XFS_INODES_PER_CHUNK;
 
-	for_each_perag(mp, agno, pag) {
-		xfs_agino_t	first, last;
-
-		xfs_agino_range(mp, agno, &first, &last);
-		nr_inos += last - first + 1;
-	}
+	for_each_perag(mp, agno, pag)
+		nr_inos += pag->agino_max - pag->agino_min + 1;
 	*max = nr_inos;
 }
 
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index 373f64a492a4..a6b7d98cf68f 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -179,19 +179,10 @@ enum xfs_ag_resv_type {
  */
 struct xfs_mount;
 
-xfs_agblock_t xfs_ag_block_count(struct xfs_mount *mp, xfs_agnumber_t agno);
-bool xfs_verify_agbno(struct xfs_mount *mp, xfs_agnumber_t agno,
-		xfs_agblock_t agbno);
 bool xfs_verify_fsbno(struct xfs_mount *mp, xfs_fsblock_t fsbno);
 bool xfs_verify_fsbext(struct xfs_mount *mp, xfs_fsblock_t fsbno,
 		xfs_fsblock_t len);
 
-void xfs_agino_range(struct xfs_mount *mp, xfs_agnumber_t agno,
-		xfs_agino_t *first, xfs_agino_t *last);
-bool xfs_verify_agino(struct xfs_mount *mp, xfs_agnumber_t agno,
-		xfs_agino_t agino);
-bool xfs_verify_agino_or_null(struct xfs_mount *mp, xfs_agnumber_t agno,
-		xfs_agino_t agino);
 bool xfs_verify_ino(struct xfs_mount *mp, xfs_ino_t ino);
 bool xfs_internal_inum(struct xfs_mount *mp, xfs_ino_t ino);
 bool xfs_verify_dir_ino(struct xfs_mount *mp, xfs_ino_t ino);
diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
index 90aebfe9dc5f..b7b838bd4ba4 100644
--- a/fs/xfs/scrub/agheader.c
+++ b/fs/xfs/scrub/agheader.c
@@ -541,16 +541,16 @@ xchk_agf(
 
 	/* Check the AG length */
 	eoag = be32_to_cpu(agf->agf_length);
-	if (eoag != xfs_ag_block_count(mp, agno))
+	if (eoag != pag->block_count)
 		xchk_block_set_corrupt(sc, sc->sa.agf_bp);
 
 	/* Check the AGF btree roots and levels */
 	agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_BNO]);
-	if (!xfs_verify_agbno(mp, agno, agbno))
+	if (!xfs_verify_agbno(pag, agbno))
 		xchk_block_set_corrupt(sc, sc->sa.agf_bp);
 
 	agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_CNT]);
-	if (!xfs_verify_agbno(mp, agno, agbno))
+	if (!xfs_verify_agbno(pag, agbno))
 		xchk_block_set_corrupt(sc, sc->sa.agf_bp);
 
 	level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]);
@@ -563,7 +563,7 @@ xchk_agf(
 
 	if (xfs_has_rmapbt(mp)) {
 		agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_RMAP]);
-		if (!xfs_verify_agbno(mp, agno, agbno))
+		if (!xfs_verify_agbno(pag, agbno))
 			xchk_block_set_corrupt(sc, sc->sa.agf_bp);
 
 		level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]);
@@ -573,7 +573,7 @@ xchk_agf(
 
 	if (xfs_has_reflink(mp)) {
 		agbno = be32_to_cpu(agf->agf_refcount_root);
-		if (!xfs_verify_agbno(mp, agno, agbno))
+		if (!xfs_verify_agbno(pag, agbno))
 			xchk_block_set_corrupt(sc, sc->sa.agf_bp);
 
 		level = be32_to_cpu(agf->agf_refcount_level);
@@ -639,9 +639,8 @@ xchk_agfl_block(
 {
 	struct xchk_agfl_info	*sai = priv;
 	struct xfs_scrub	*sc = sai->sc;
-	xfs_agnumber_t		agno = sc->sa.pag->pag_agno;
 
-	if (xfs_verify_agbno(mp, agno, agbno) &&
+	if (xfs_verify_agbno(sc->sa.pag, agbno) &&
 	    sai->nr_entries < sai->sz_entries)
 		sai->entries[sai->nr_entries++] = agbno;
 	else
@@ -871,12 +870,12 @@ xchk_agi(
 
 	/* Check the AG length */
 	eoag = be32_to_cpu(agi->agi_length);
-	if (eoag != xfs_ag_block_count(mp, agno))
+	if (eoag != pag->block_count)
 		xchk_block_set_corrupt(sc, sc->sa.agi_bp);
 
 	/* Check btree roots and levels */
 	agbno = be32_to_cpu(agi->agi_root);
-	if (!xfs_verify_agbno(mp, agno, agbno))
+	if (!xfs_verify_agbno(pag, agbno))
 		xchk_block_set_corrupt(sc, sc->sa.agi_bp);
 
 	level = be32_to_cpu(agi->agi_level);
@@ -885,7 +884,7 @@ xchk_agi(
 
 	if (xfs_has_finobt(mp)) {
 		agbno = be32_to_cpu(agi->agi_free_root);
-		if (!xfs_verify_agbno(mp, agno, agbno))
+		if (!xfs_verify_agbno(pag, agbno))
 			xchk_block_set_corrupt(sc, sc->sa.agi_bp);
 
 		level = be32_to_cpu(agi->agi_free_level);
@@ -902,17 +901,17 @@ xchk_agi(
 
 	/* Check inode pointers */
 	agino = be32_to_cpu(agi->agi_newino);
-	if (!xfs_verify_agino_or_null(mp, agno, agino))
+	if (!xfs_verify_agino_or_null(pag, agino))
 		xchk_block_set_corrupt(sc, sc->sa.agi_bp);
 
 	agino = be32_to_cpu(agi->agi_dirino);
-	if (!xfs_verify_agino_or_null(mp, agno, agino))
+	if (!xfs_verify_agino_or_null(pag, agino))
 		xchk_block_set_corrupt(sc, sc->sa.agi_bp);
 
 	/* Check unlinked inode buckets */
 	for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) {
 		agino = be32_to_cpu(agi->agi_unlinked[i]);
-		if (!xfs_verify_agino_or_null(mp, agno, agino))
+		if (!xfs_verify_agino_or_null(pag, agino))
 			xchk_block_set_corrupt(sc, sc->sa.agi_bp);
 	}
 
diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c
index 6da7f2ca77de..1b0b4e243f77 100644
--- a/fs/xfs/scrub/agheader_repair.c
+++ b/fs/xfs/scrub/agheader_repair.c
@@ -106,7 +106,7 @@ xrep_agf_check_agfl_block(
 {
 	struct xfs_scrub	*sc = priv;
 
-	if (!xfs_verify_agbno(mp, sc->sa.pag->pag_agno, agbno))
+	if (!xfs_verify_agbno(sc->sa.pag, agbno))
 		return -EFSCORRUPTED;
 	return 0;
 }
@@ -130,10 +130,7 @@ xrep_check_btree_root(
 	struct xfs_scrub		*sc,
 	struct xrep_find_ag_btree	*fab)
 {
-	struct xfs_mount		*mp = sc->mp;
-	xfs_agnumber_t			agno = sc->sm->sm_agno;
-
-	return xfs_verify_agbno(mp, agno, fab->root) &&
+	return xfs_verify_agbno(sc->sa.pag, fab->root) &&
 	       fab->height <= fab->maxlevels;
 }
 
@@ -201,8 +198,7 @@ xrep_agf_init_header(
 	agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC);
 	agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION);
 	agf->agf_seqno = cpu_to_be32(sc->sa.pag->pag_agno);
-	agf->agf_length = cpu_to_be32(xfs_ag_block_count(mp,
-							sc->sa.pag->pag_agno));
+	agf->agf_length = cpu_to_be32(sc->sa.pag->block_count);
 	agf->agf_flfirst = old_agf->agf_flfirst;
 	agf->agf_fllast = old_agf->agf_fllast;
 	agf->agf_flcount = old_agf->agf_flcount;
@@ -405,7 +401,7 @@ xrep_agf(
 	 * btrees rooted in the AGF.  If the AGFL contents are obviously bad
 	 * then we'll bail out.
 	 */
-	error = xfs_alloc_read_agfl(mp, sc->tp, sc->sa.pag->pag_agno, &agfl_bp);
+	error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp);
 	if (error)
 		return error;
 
@@ -666,8 +662,7 @@ xrep_agfl(
 	 * nothing wrong with the AGF, but all the AG header repair functions
 	 * have this chicken-and-egg problem.
 	 */
-	error = xfs_alloc_read_agf(mp, sc->tp, sc->sa.pag->pag_agno, 0,
-			&agf_bp);
+	error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &agf_bp);
 	if (error)
 		return error;
 
@@ -742,8 +737,7 @@ xrep_agi_find_btrees(
 	int				error;
 
 	/* Read the AGF. */
-	error = xfs_alloc_read_agf(mp, sc->tp, sc->sa.pag->pag_agno, 0,
-			&agf_bp);
+	error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &agf_bp);
 	if (error)
 		return error;
 
@@ -782,8 +776,7 @@ xrep_agi_init_header(
 	agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC);
 	agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION);
 	agi->agi_seqno = cpu_to_be32(sc->sa.pag->pag_agno);
-	agi->agi_length = cpu_to_be32(xfs_ag_block_count(mp,
-							sc->sa.pag->pag_agno));
+	agi->agi_length = cpu_to_be32(sc->sa.pag->block_count);
 	agi->agi_newino = cpu_to_be32(NULLAGINO);
 	agi->agi_dirino = cpu_to_be32(NULLAGINO);
 	if (xfs_has_crc(mp))
diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c
index 87518e1292f8..ab427b4d7fe0 100644
--- a/fs/xfs/scrub/alloc.c
+++ b/fs/xfs/scrub/alloc.c
@@ -93,8 +93,7 @@ xchk_allocbt_rec(
 	struct xchk_btree	*bs,
 	const union xfs_btree_rec *rec)
 {
-	struct xfs_mount	*mp = bs->cur->bc_mp;
-	xfs_agnumber_t		agno = bs->cur->bc_ag.pag->pag_agno;
+	struct xfs_perag	*pag = bs->cur->bc_ag.pag;
 	xfs_agblock_t		bno;
 	xfs_extlen_t		len;
 
@@ -102,8 +101,8 @@ xchk_allocbt_rec(
 	len = be32_to_cpu(rec->alloc.ar_blockcount);
 
 	if (bno + len <= bno ||
-	    !xfs_verify_agbno(mp, agno, bno) ||
-	    !xfs_verify_agbno(mp, agno, bno + len - 1))
+	    !xfs_verify_agbno(pag, bno) ||
+	    !xfs_verify_agbno(pag, bno + len - 1))
 		xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
 
 	xchk_allocbt_xref(bs->sc, bno, len);
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index 285995ba3947..f0b9cb6506fd 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -377,7 +377,7 @@ xchk_bmapbt_rec(
 	struct xfs_inode	*ip = bs->cur->bc_ino.ip;
 	struct xfs_buf		*bp = NULL;
 	struct xfs_btree_block	*block;
-	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, info->whichfork);
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, info->whichfork);
 	uint64_t		owner;
 	int			i;
 
@@ -426,7 +426,7 @@ xchk_bmap_btree(
 	struct xchk_bmap_info	*info)
 {
 	struct xfs_owner_info	oinfo;
-	struct xfs_ifork	*ifp = XFS_IFORK_PTR(sc->ip, whichfork);
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(sc->ip, whichfork);
 	struct xfs_mount	*mp = sc->mp;
 	struct xfs_inode	*ip = sc->ip;
 	struct xfs_btree_cur	*cur;
@@ -478,7 +478,7 @@ xchk_bmap_check_rmap(
 		return 0;
 
 	/* Now look up the bmbt record. */
-	ifp = XFS_IFORK_PTR(sc->ip, sbcri->whichfork);
+	ifp = xfs_ifork_ptr(sc->ip, sbcri->whichfork);
 	if (!ifp) {
 		xchk_fblock_set_corrupt(sc, sbcri->whichfork,
 				rec->rm_offset);
@@ -540,7 +540,7 @@ xchk_bmap_check_ag_rmaps(
 	struct xfs_buf			*agf;
 	int				error;
 
-	error = xfs_alloc_read_agf(sc->mp, sc->tp, pag->pag_agno, 0, &agf);
+	error = xfs_alloc_read_agf(pag, sc->tp, 0, &agf);
 	if (error)
 		return error;
 
@@ -563,7 +563,7 @@ xchk_bmap_check_rmaps(
 	struct xfs_scrub	*sc,
 	int			whichfork)
 {
-	struct xfs_ifork	*ifp = XFS_IFORK_PTR(sc->ip, whichfork);
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(sc->ip, whichfork);
 	struct xfs_perag	*pag;
 	xfs_agnumber_t		agno;
 	bool			zero_size;
@@ -578,7 +578,7 @@ xchk_bmap_check_rmaps(
 	if (XFS_IS_REALTIME_INODE(sc->ip) && whichfork == XFS_DATA_FORK)
 		return 0;
 
-	ASSERT(XFS_IFORK_PTR(sc->ip, whichfork) != NULL);
+	ASSERT(xfs_ifork_ptr(sc->ip, whichfork) != NULL);
 
 	/*
 	 * Only do this for complex maps that are in btree format, or for
@@ -624,7 +624,7 @@ xchk_bmap(
 	struct xchk_bmap_info	info = { NULL };
 	struct xfs_mount	*mp = sc->mp;
 	struct xfs_inode	*ip = sc->ip;
-	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
 	xfs_fileoff_t		endoff;
 	struct xfs_iext_cursor	icur;
 	int			error = 0;
@@ -689,7 +689,7 @@ xchk_bmap(
 
 	/* Scrub extent records. */
 	info.lastoff = 0;
-	ifp = XFS_IFORK_PTR(ip, whichfork);
+	ifp = xfs_ifork_ptr(ip, whichfork);
 	for_each_xfs_iext(ifp, &icur, &irec) {
 		if (xchk_should_terminate(sc, &error) ||
 		    (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c
index 39dd46f038fe..2f4519590dc1 100644
--- a/fs/xfs/scrub/btree.c
+++ b/fs/xfs/scrub/btree.c
@@ -462,7 +462,7 @@ xchk_btree_check_iroot_minrecs(
 	 */
 	if (bs->cur->bc_btnum == XFS_BTNUM_BMAP &&
 	    bs->cur->bc_ino.whichfork == XFS_DATA_FORK &&
-	    XFS_IFORK_Q(bs->sc->ip))
+	    xfs_inode_has_attr_fork(bs->sc->ip))
 		return false;
 
 	return true;
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index 97b54ac3075f..9bbbf20f401b 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -416,15 +416,15 @@ xchk_ag_read_headers(
 	if (!sa->pag)
 		return -ENOENT;
 
-	error = xfs_ialloc_read_agi(mp, sc->tp, agno, &sa->agi_bp);
+	error = xfs_ialloc_read_agi(sa->pag, sc->tp, &sa->agi_bp);
 	if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGI))
 		return error;
 
-	error = xfs_alloc_read_agf(mp, sc->tp, agno, 0, &sa->agf_bp);
+	error = xfs_alloc_read_agf(sa->pag, sc->tp, 0, &sa->agf_bp);
 	if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGF))
 		return error;
 
-	error = xfs_alloc_read_agfl(mp, sc->tp, agno, &sa->agfl_bp);
+	error = xfs_alloc_read_agfl(sa->pag, sc->tp, &sa->agfl_bp);
 	if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGFL))
 		return error;
 
diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c
index b962cfbbd92b..84fe3d33d699 100644
--- a/fs/xfs/scrub/dabtree.c
+++ b/fs/xfs/scrub/dabtree.c
@@ -482,7 +482,7 @@ xchk_da_btree(
 	int				error;
 
 	/* Skip short format data structures; no btree to scan. */
-	if (!xfs_ifork_has_extents(XFS_IFORK_PTR(sc->ip, whichfork)))
+	if (!xfs_ifork_has_extents(xfs_ifork_ptr(sc->ip, whichfork)))
 		return 0;
 
 	/* Set up initial da state. */
diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c
index 38897adde7b5..5abb5fdb71d9 100644
--- a/fs/xfs/scrub/dir.c
+++ b/fs/xfs/scrub/dir.c
@@ -667,7 +667,7 @@ xchk_directory_blocks(
 {
 	struct xfs_bmbt_irec	got;
 	struct xfs_da_args	args;
-	struct xfs_ifork	*ifp = XFS_IFORK_PTR(sc->ip, XFS_DATA_FORK);
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK);
 	struct xfs_mount	*mp = sc->mp;
 	xfs_fileoff_t		leaf_lblk;
 	xfs_fileoff_t		free_lblk;
diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c
index 48a6cbdf95d0..6a6f8fe7f87c 100644
--- a/fs/xfs/scrub/fscounters.c
+++ b/fs/xfs/scrub/fscounters.c
@@ -78,10 +78,10 @@ xchk_fscount_warmup(
 			continue;
 
 		/* Lock both AG headers. */
-		error = xfs_ialloc_read_agi(mp, sc->tp, agno, &agi_bp);
+		error = xfs_ialloc_read_agi(pag, sc->tp, &agi_bp);
 		if (error)
 			break;
-		error = xfs_alloc_read_agf(mp, sc->tp, agno, 0, &agf_bp);
+		error = xfs_alloc_read_agf(pag, sc->tp, 0, &agf_bp);
 		if (error)
 			break;
 
diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c
index 2e61df3bca83..aa65ec88a0c0 100644
--- a/fs/xfs/scrub/health.c
+++ b/fs/xfs/scrub/health.c
@@ -8,6 +8,8 @@
 #include "xfs_shared.h"
 #include "xfs_format.h"
 #include "xfs_btree.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
 #include "xfs_ag.h"
 #include "xfs_health.h"
 #include "scrub/scrub.h"
diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c
index 00848ee542fb..e1026e07bf94 100644
--- a/fs/xfs/scrub/ialloc.c
+++ b/fs/xfs/scrub/ialloc.c
@@ -104,13 +104,13 @@ xchk_iallocbt_chunk(
 	xfs_extlen_t			len)
 {
 	struct xfs_mount		*mp = bs->cur->bc_mp;
-	xfs_agnumber_t			agno = bs->cur->bc_ag.pag->pag_agno;
+	struct xfs_perag		*pag = bs->cur->bc_ag.pag;
 	xfs_agblock_t			bno;
 
 	bno = XFS_AGINO_TO_AGBNO(mp, agino);
 	if (bno + len <= bno ||
-	    !xfs_verify_agbno(mp, agno, bno) ||
-	    !xfs_verify_agbno(mp, agno, bno + len - 1))
+	    !xfs_verify_agbno(pag, bno) ||
+	    !xfs_verify_agbno(pag, bno + len - 1))
 		xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
 
 	xchk_iallocbt_chunk_xref(bs->sc, irec, agino, bno, len);
@@ -421,10 +421,10 @@ xchk_iallocbt_rec(
 	const union xfs_btree_rec	*rec)
 {
 	struct xfs_mount		*mp = bs->cur->bc_mp;
+	struct xfs_perag		*pag = bs->cur->bc_ag.pag;
 	struct xchk_iallocbt		*iabt = bs->private;
 	struct xfs_inobt_rec_incore	irec;
 	uint64_t			holes;
-	xfs_agnumber_t			agno = bs->cur->bc_ag.pag->pag_agno;
 	xfs_agino_t			agino;
 	xfs_extlen_t			len;
 	int				holecount;
@@ -446,8 +446,8 @@ xchk_iallocbt_rec(
 
 	agino = irec.ir_startino;
 	/* Record has to be properly aligned within the AG. */
-	if (!xfs_verify_agino(mp, agno, agino) ||
-	    !xfs_verify_agino(mp, agno, agino + XFS_INODES_PER_CHUNK - 1)) {
+	if (!xfs_verify_agino(pag, agino) ||
+	    !xfs_verify_agino(pag, agino + XFS_INODES_PER_CHUNK - 1)) {
 		xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
 		goto out;
 	}
diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c
index 3c7506c7553c..21b4c9006859 100644
--- a/fs/xfs/scrub/quota.c
+++ b/fs/xfs/scrub/quota.c
@@ -185,7 +185,7 @@ xchk_quota_data_fork(
 
 	/* Check for data fork problems that apply only to quota files. */
 	max_dqid_off = ((xfs_dqid_t)-1) / qi->qi_dqperchunk;
-	ifp = XFS_IFORK_PTR(sc->ip, XFS_DATA_FORK);
+	ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK);
 	for_each_xfs_iext(ifp, &icur, &irec) {
 		if (xchk_should_terminate(sc, &error))
 			break;
diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c
index 2744eecdbaf0..c68b767dc08f 100644
--- a/fs/xfs/scrub/refcount.c
+++ b/fs/xfs/scrub/refcount.c
@@ -13,6 +13,8 @@
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/btree.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
 #include "xfs_ag.h"
 
 /*
@@ -332,9 +334,8 @@ xchk_refcountbt_rec(
 	struct xchk_btree	*bs,
 	const union xfs_btree_rec *rec)
 {
-	struct xfs_mount	*mp = bs->cur->bc_mp;
 	xfs_agblock_t		*cow_blocks = bs->private;
-	xfs_agnumber_t		agno = bs->cur->bc_ag.pag->pag_agno;
+	struct xfs_perag	*pag = bs->cur->bc_ag.pag;
 	xfs_agblock_t		bno;
 	xfs_extlen_t		len;
 	xfs_nlink_t		refcount;
@@ -354,8 +355,8 @@ xchk_refcountbt_rec(
 	/* Check the extent. */
 	bno &= ~XFS_REFC_COW_START;
 	if (bno + len <= bno ||
-	    !xfs_verify_agbno(mp, agno, bno) ||
-	    !xfs_verify_agbno(mp, agno, bno + len - 1))
+	    !xfs_verify_agbno(pag, bno) ||
+	    !xfs_verify_agbno(pag, bno + len - 1))
 		xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
 
 	if (refcount == 0)
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 1e7b6b209ee8..c18bd039fce9 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -199,7 +199,7 @@ xrep_calc_ag_resblks(
 		icount = pag->pagi_count;
 	} else {
 		/* Try to get the actual counters from disk. */
-		error = xfs_ialloc_read_agi(mp, NULL, sm->sm_agno, &bp);
+		error = xfs_ialloc_read_agi(pag, NULL, &bp);
 		if (!error) {
 			icount = pag->pagi_count;
 			xfs_buf_relse(bp);
@@ -207,9 +207,9 @@ xrep_calc_ag_resblks(
 	}
 
 	/* Now grab the block counters from the AGF. */
-	error = xfs_alloc_read_agf(mp, NULL, sm->sm_agno, 0, &bp);
+	error = xfs_alloc_read_agf(pag, NULL, 0, &bp);
 	if (error) {
-		aglen = xfs_ag_block_count(mp, sm->sm_agno);
+		aglen = pag->block_count;
 		freelen = aglen;
 		usedlen = aglen;
 	} else {
@@ -220,25 +220,22 @@ xrep_calc_ag_resblks(
 		usedlen = aglen - freelen;
 		xfs_buf_relse(bp);
 	}
-	xfs_perag_put(pag);
 
 	/* If the icount is impossible, make some worst-case assumptions. */
 	if (icount == NULLAGINO ||
-	    !xfs_verify_agino(mp, sm->sm_agno, icount)) {
-		xfs_agino_t	first, last;
-
-		xfs_agino_range(mp, sm->sm_agno, &first, &last);
-		icount = last - first + 1;
+	    !xfs_verify_agino(pag, icount)) {
+		icount = pag->agino_max - pag->agino_min + 1;
 	}
 
 	/* If the block counts are impossible, make worst-case assumptions. */
 	if (aglen == NULLAGBLOCK ||
-	    aglen != xfs_ag_block_count(mp, sm->sm_agno) ||
+	    aglen != pag->block_count ||
 	    freelen >= aglen) {
-		aglen = xfs_ag_block_count(mp, sm->sm_agno);
+		aglen = pag->block_count;
 		freelen = aglen;
 		usedlen = aglen;
 	}
+	xfs_perag_put(pag);
 
 	trace_xrep_calc_ag_resblks(mp, sm->sm_agno, icount, aglen,
 			freelen, usedlen);
@@ -300,13 +297,13 @@ xrep_alloc_ag_block(
 	switch (resv) {
 	case XFS_AG_RESV_AGFL:
 	case XFS_AG_RESV_RMAPBT:
-		error = xfs_alloc_get_freelist(sc->tp, sc->sa.agf_bp, &bno, 1);
+		error = xfs_alloc_get_freelist(sc->sa.pag, sc->tp,
+				sc->sa.agf_bp, &bno, 1);
 		if (error)
 			return error;
 		if (bno == NULLAGBLOCK)
 			return -ENOSPC;
-		xfs_extent_busy_reuse(sc->mp, sc->sa.pag, bno,
-				1, false);
+		xfs_extent_busy_reuse(sc->mp, sc->sa.pag, bno, 1, false);
 		*fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, bno);
 		if (resv == XFS_AG_RESV_RMAPBT)
 			xfs_ag_resv_rmapbt_alloc(sc->mp, sc->sa.pag->pag_agno);
@@ -457,16 +454,19 @@ xrep_invalidate_blocks(
 	 * assume it's owned by someone else.
 	 */
 	for_each_xbitmap_block(fsbno, bmr, n, bitmap) {
+		int		error;
+
 		/* Skip AG headers and post-EOFS blocks */
 		if (!xfs_verify_fsbno(sc->mp, fsbno))
 			continue;
-		bp = xfs_buf_incore(sc->mp->m_ddev_targp,
+		error = xfs_buf_incore(sc->mp->m_ddev_targp,
 				XFS_FSB_TO_DADDR(sc->mp, fsbno),
-				XFS_FSB_TO_BB(sc->mp, 1), XBF_TRYLOCK);
-		if (bp) {
-			xfs_trans_bjoin(sc->tp, bp);
-			xfs_trans_binval(sc->tp, bp);
-		}
+				XFS_FSB_TO_BB(sc->mp, 1), XBF_TRYLOCK, &bp);
+		if (error)
+			continue;
+
+		xfs_trans_bjoin(sc->tp, bp);
+		xfs_trans_binval(sc->tp, bp);
 	}
 
 	return 0;
@@ -516,8 +516,8 @@ xrep_put_freelist(
 		return error;
 
 	/* Put the block on the AGFL. */
-	error = xfs_alloc_put_freelist(sc->tp, sc->sa.agf_bp, sc->sa.agfl_bp,
-			agbno, 0);
+	error = xfs_alloc_put_freelist(sc->sa.pag, sc->tp, sc->sa.agf_bp,
+			sc->sa.agfl_bp, agbno, 0);
 	if (error)
 		return error;
 	xfs_extent_busy_insert(sc->tp, sc->sa.pag, agbno, 1,
@@ -536,13 +536,12 @@ xrep_reap_block(
 {
 	struct xfs_btree_cur		*cur;
 	struct xfs_buf			*agf_bp = NULL;
-	xfs_agnumber_t			agno;
 	xfs_agblock_t			agbno;
 	bool				has_other_rmap;
 	int				error;
 
-	agno = XFS_FSB_TO_AGNO(sc->mp, fsbno);
 	agbno = XFS_FSB_TO_AGBNO(sc->mp, fsbno);
+	ASSERT(XFS_FSB_TO_AGNO(sc->mp, fsbno) == sc->sa.pag->pag_agno);
 
 	/*
 	 * If we are repairing per-inode metadata, we need to read in the AGF
@@ -550,7 +549,7 @@ xrep_reap_block(
 	 * the AGF buffer that the setup functions already grabbed.
 	 */
 	if (sc->ip) {
-		error = xfs_alloc_read_agf(sc->mp, sc->tp, agno, 0, &agf_bp);
+		error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &agf_bp);
 		if (error)
 			return error;
 	} else {
diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c
index 8dae0345c7df..229826b2e1c0 100644
--- a/fs/xfs/scrub/rmap.c
+++ b/fs/xfs/scrub/rmap.c
@@ -92,7 +92,7 @@ xchk_rmapbt_rec(
 {
 	struct xfs_mount	*mp = bs->cur->bc_mp;
 	struct xfs_rmap_irec	irec;
-	xfs_agnumber_t		agno = bs->cur->bc_ag.pag->pag_agno;
+	struct xfs_perag	*pag = bs->cur->bc_ag.pag;
 	bool			non_inode;
 	bool			is_unwritten;
 	bool			is_bmbt;
@@ -121,8 +121,8 @@ xchk_rmapbt_rec(
 		 * Otherwise we must point somewhere past the static metadata
 		 * but before the end of the FS.  Run the regular check.
 		 */
-		if (!xfs_verify_agbno(mp, agno, irec.rm_startblock) ||
-		    !xfs_verify_agbno(mp, agno, irec.rm_startblock +
+		if (!xfs_verify_agbno(pag, irec.rm_startblock) ||
+		    !xfs_verify_agbno(pag, irec.rm_startblock +
 				irec.rm_blockcount - 1))
 			xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
 	}
diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c
index 599ee277bba2..75311f8daeeb 100644
--- a/fs/xfs/scrub/symlink.c
+++ b/fs/xfs/scrub/symlink.c
@@ -41,7 +41,7 @@ xchk_symlink(
 
 	if (!S_ISLNK(VFS_I(ip)->i_mode))
 		return -ENOENT;
-	ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+	ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
 	len = ip->i_disk_size;
 
 	/* Plausible size? */
@@ -52,8 +52,8 @@ xchk_symlink(
 
 	/* Inline symlink? */
 	if (ifp->if_format == XFS_DINODE_FMT_LOCAL) {
-		if (len > XFS_IFORK_DSIZE(ip) ||
-		    len > strnlen(ifp->if_u1.if_data, XFS_IFORK_DSIZE(ip)))
+		if (len > xfs_inode_data_fork_size(ip) ||
+		    len > strnlen(ifp->if_u1.if_data, xfs_inode_data_fork_size(ip)))
 			xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
 		goto out;
 	}
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
index 27265771f247..5db87b34fb6e 100644
--- a/fs/xfs/xfs_attr_inactive.c
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -158,6 +158,7 @@ xfs_attr3_node_inactive(
 	}
 	child_fsb = be32_to_cpu(ichdr.btree[0].before);
 	xfs_trans_brelse(*trans, bp);	/* no locks for later trans */
+	bp = NULL;
 
 	/*
 	 * If this is the node level just above the leaves, simply loop
@@ -211,12 +212,8 @@ xfs_attr3_node_inactive(
 				&child_bp);
 		if (error)
 			return error;
-		error = bp->b_error;
-		if (error) {
-			xfs_trans_brelse(*trans, child_bp);
-			return error;
-		}
 		xfs_trans_binval(*trans, child_bp);
+		child_bp = NULL;
 
 		/*
 		 * If we're not done, re-read the parent to get the next
@@ -233,6 +230,7 @@ xfs_attr3_node_inactive(
 						  bp->b_addr);
 			child_fsb = be32_to_cpu(phdr.btree[i + 1].before);
 			xfs_trans_brelse(*trans, bp);
+			bp = NULL;
 		}
 		/*
 		 * Atomically commit the whole invalidate stuff.
@@ -338,7 +336,7 @@ xfs_attr_inactive(
 	ASSERT(! XFS_NOT_DQATTACHED(mp, dp));
 
 	xfs_ilock(dp, lock_mode);
-	if (!XFS_IFORK_Q(dp))
+	if (!xfs_inode_has_attr_fork(dp))
 		goto out_destroy_fork;
 	xfs_iunlock(dp, lock_mode);
 
@@ -351,7 +349,7 @@ xfs_attr_inactive(
 	lock_mode = XFS_ILOCK_EXCL;
 	xfs_ilock(dp, lock_mode);
 
-	if (!XFS_IFORK_Q(dp))
+	if (!xfs_inode_has_attr_fork(dp))
 		goto out_cancel;
 
 	/*
@@ -362,12 +360,11 @@ xfs_attr_inactive(
 
 	/*
 	 * Invalidate and truncate the attribute fork extents. Make sure the
-	 * fork actually has attributes as otherwise the invalidation has no
+	 * fork actually has xattr blocks as otherwise the invalidation has no
 	 * blocks to read and returns an error. In this case, just do the fork
 	 * removal below.
 	 */
-	if (xfs_inode_hasattr(dp) &&
-	    dp->i_afp->if_format != XFS_DINODE_FMT_LOCAL) {
+	if (dp->i_af.if_nextents > 0) {
 		error = xfs_attr3_root_inactive(&trans, dp);
 		if (error)
 			goto out_cancel;
@@ -388,11 +385,7 @@ out_cancel:
 	xfs_trans_cancel(trans);
 out_destroy_fork:
 	/* kill the in-core attr fork before we drop the inode lock */
-	if (dp->i_afp) {
-		xfs_idestroy_fork(dp->i_afp);
-		kmem_cache_free(xfs_ifork_cache, dp->i_afp);
-		dp->i_afp = NULL;
-	}
+	xfs_ifork_zap_attr(dp);
 	if (lock_mode)
 		xfs_iunlock(dp, lock_mode);
 	return error;
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index 90a14e85e76d..99bbbe1a0e44 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -61,8 +61,7 @@ xfs_attr_shortform_list(
 	int				sbsize, nsbuf, count, i;
 	int				error = 0;
 
-	ASSERT(dp->i_afp != NULL);
-	sf = (struct xfs_attr_shortform *)dp->i_afp->if_u1.if_data;
+	sf = (struct xfs_attr_shortform *)dp->i_af.if_u1.if_data;
 	ASSERT(sf != NULL);
 	if (!sf->hdr.count)
 		return 0;
@@ -80,7 +79,7 @@ xfs_attr_shortform_list(
 	 */
 	if (context->bufsize == 0 ||
 	    (XFS_ISRESET_CURSOR(cursor) &&
-	     (dp->i_afp->if_bytes + sf->hdr.count * 16) < context->bufsize)) {
+	     (dp->i_af.if_bytes + sf->hdr.count * 16) < context->bufsize)) {
 		for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) {
 			if (XFS_IS_CORRUPT(context->dp->i_mount,
 					   !xfs_attr_namecheck(sfe->nameval,
@@ -121,7 +120,7 @@ xfs_attr_shortform_list(
 	for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) {
 		if (unlikely(
 		    ((char *)sfe < (char *)sf) ||
-		    ((char *)sfe >= ((char *)sf + dp->i_afp->if_bytes)))) {
+		    ((char *)sfe >= ((char *)sf + dp->i_af.if_bytes)))) {
 			XFS_CORRUPTION_ERROR("xfs_attr_shortform_list",
 					     XFS_ERRLEVEL_LOW,
 					     context->dp->i_mount, sfe,
@@ -513,7 +512,7 @@ xfs_attr_list_ilocked(
 	 */
 	if (!xfs_inode_hasattr(dp))
 		return 0;
-	if (dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL)
+	if (dp->i_af.if_format == XFS_DINODE_FMT_LOCAL)
 		return xfs_attr_shortform_list(context);
 	if (xfs_attr_is_leaf(dp))
 		return xfs_attr_leaf_list(context);
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 85e1a26c92e8..04d0c2bff67c 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -256,7 +256,7 @@ xfs_bmap_count_blocks(
 	xfs_filblks_t		*count)
 {
 	struct xfs_mount	*mp = ip->i_mount;
-	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
 	struct xfs_btree_cur	*cur;
 	xfs_extlen_t		btblocks = 0;
 	int			error;
@@ -439,29 +439,28 @@ xfs_getbmap(
 		whichfork = XFS_COW_FORK;
 	else
 		whichfork = XFS_DATA_FORK;
-	ifp = XFS_IFORK_PTR(ip, whichfork);
 
 	xfs_ilock(ip, XFS_IOLOCK_SHARED);
 	switch (whichfork) {
 	case XFS_ATTR_FORK:
-		if (!XFS_IFORK_Q(ip))
-			goto out_unlock_iolock;
+		lock = xfs_ilock_attr_map_shared(ip);
+		if (!xfs_inode_has_attr_fork(ip))
+			goto out_unlock_ilock;
 
 		max_len = 1LL << 32;
-		lock = xfs_ilock_attr_map_shared(ip);
 		break;
 	case XFS_COW_FORK:
+		lock = XFS_ILOCK_SHARED;
+		xfs_ilock(ip, lock);
+
 		/* No CoW fork? Just return */
-		if (!ifp)
-			goto out_unlock_iolock;
+		if (!xfs_ifork_ptr(ip, whichfork))
+			goto out_unlock_ilock;
 
 		if (xfs_get_cowextsz_hint(ip))
 			max_len = mp->m_super->s_maxbytes;
 		else
 			max_len = XFS_ISIZE(ip);
-
-		lock = XFS_ILOCK_SHARED;
-		xfs_ilock(ip, lock);
 		break;
 	case XFS_DATA_FORK:
 		if (!(iflags & BMV_IF_DELALLOC) &&
@@ -491,6 +490,8 @@ xfs_getbmap(
 		break;
 	}
 
+	ifp = xfs_ifork_ptr(ip, whichfork);
+
 	switch (ifp->if_format) {
 	case XFS_DINODE_FMT_EXTENTS:
 	case XFS_DINODE_FMT_BTREE:
@@ -1320,8 +1321,8 @@ xfs_swap_extents_check_format(
 	 * extent format...
 	 */
 	if (tifp->if_format == XFS_DINODE_FMT_BTREE) {
-		if (XFS_IFORK_Q(ip) &&
-		    XFS_BMAP_BMDR_SPACE(tifp->if_broot) > XFS_IFORK_BOFF(ip))
+		if (xfs_inode_has_attr_fork(ip) &&
+		    XFS_BMAP_BMDR_SPACE(tifp->if_broot) > xfs_inode_fork_boff(ip))
 			return -EINVAL;
 		if (tifp->if_nextents <= XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
 			return -EINVAL;
@@ -1329,8 +1330,8 @@ xfs_swap_extents_check_format(
 
 	/* Reciprocal target->temp btree format checks */
 	if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
-		if (XFS_IFORK_Q(tip) &&
-		    XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > XFS_IFORK_BOFF(tip))
+		if (xfs_inode_has_attr_fork(tip) &&
+		    XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > xfs_inode_fork_boff(tip))
 			return -EINVAL;
 		if (ifp->if_nextents <= XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
 			return -EINVAL;
@@ -1506,15 +1507,15 @@ xfs_swap_extent_forks(
 	/*
 	 * Count the number of extended attribute blocks
 	 */
-	if (XFS_IFORK_Q(ip) && ip->i_afp->if_nextents > 0 &&
-	    ip->i_afp->if_format != XFS_DINODE_FMT_LOCAL) {
+	if (xfs_inode_has_attr_fork(ip) && ip->i_af.if_nextents > 0 &&
+	    ip->i_af.if_format != XFS_DINODE_FMT_LOCAL) {
 		error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &junk,
 				&aforkblks);
 		if (error)
 			return error;
 	}
-	if (XFS_IFORK_Q(tip) && tip->i_afp->if_nextents > 0 &&
-	    tip->i_afp->if_format != XFS_DINODE_FMT_LOCAL) {
+	if (xfs_inode_has_attr_fork(tip) && tip->i_af.if_nextents > 0 &&
+	    tip->i_af.if_format != XFS_DINODE_FMT_LOCAL) {
 		error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK, &junk,
 				&taforkblks);
 		if (error)
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 5e8f40d8c052..584afe076923 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -21,7 +21,7 @@
 #include "xfs_error.h"
 #include "xfs_ag.h"
 
-static struct kmem_cache *xfs_buf_cache;
+struct kmem_cache *xfs_buf_cache;
 
 /*
  * Locking orders
@@ -294,6 +294,16 @@ xfs_buf_free_pages(
 	bp->b_flags &= ~_XBF_PAGES;
 }
 
+static void
+xfs_buf_free_callback(
+	struct callback_head	*cb)
+{
+	struct xfs_buf		*bp = container_of(cb, struct xfs_buf, b_rcu);
+
+	xfs_buf_free_maps(bp);
+	kmem_cache_free(xfs_buf_cache, bp);
+}
+
 static void
 xfs_buf_free(
 	struct xfs_buf		*bp)
@@ -307,8 +317,7 @@ xfs_buf_free(
 	else if (bp->b_flags & _XBF_KMEM)
 		kmem_free(bp->b_addr);
 
-	xfs_buf_free_maps(bp);
-	kmem_cache_free(xfs_buf_cache, bp);
+	call_rcu(&bp->b_rcu, xfs_buf_free_callback);
 }
 
 static int
@@ -503,100 +512,45 @@ xfs_buf_hash_destroy(
 	rhashtable_destroy(&pag->pag_buf_hash);
 }
 
-/*
- * Look up a buffer in the buffer cache and return it referenced and locked
- * in @found_bp.
- *
- * If @new_bp is supplied and we have a lookup miss, insert @new_bp into the
- * cache.
- *
- * If XBF_TRYLOCK is set in @flags, only try to lock the buffer and return
- * -EAGAIN if we fail to lock it.
- *
- * Return values are:
- *	-EFSCORRUPTED if have been supplied with an invalid address
- *	-EAGAIN on trylock failure
- *	-ENOENT if we fail to find a match and @new_bp was NULL
- *	0, with @found_bp:
- *		- @new_bp if we inserted it into the cache
- *		- the buffer we found and locked.
- */
 static int
-xfs_buf_find(
+xfs_buf_map_verify(
 	struct xfs_buftarg	*btp,
-	struct xfs_buf_map	*map,
-	int			nmaps,
-	xfs_buf_flags_t		flags,
-	struct xfs_buf		*new_bp,
-	struct xfs_buf		**found_bp)
+	struct xfs_buf_map	*map)
 {
-	struct xfs_perag	*pag;
-	struct xfs_buf		*bp;
-	struct xfs_buf_map	cmap = { .bm_bn = map[0].bm_bn };
 	xfs_daddr_t		eofs;
-	int			i;
-
-	*found_bp = NULL;
-
-	for (i = 0; i < nmaps; i++)
-		cmap.bm_len += map[i].bm_len;
 
 	/* Check for IOs smaller than the sector size / not sector aligned */
-	ASSERT(!(BBTOB(cmap.bm_len) < btp->bt_meta_sectorsize));
-	ASSERT(!(BBTOB(cmap.bm_bn) & (xfs_off_t)btp->bt_meta_sectormask));
+	ASSERT(!(BBTOB(map->bm_len) < btp->bt_meta_sectorsize));
+	ASSERT(!(BBTOB(map->bm_bn) & (xfs_off_t)btp->bt_meta_sectormask));
 
 	/*
 	 * Corrupted block numbers can get through to here, unfortunately, so we
 	 * have to check that the buffer falls within the filesystem bounds.
 	 */
 	eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks);
-	if (cmap.bm_bn < 0 || cmap.bm_bn >= eofs) {
+	if (map->bm_bn < 0 || map->bm_bn >= eofs) {
 		xfs_alert(btp->bt_mount,
 			  "%s: daddr 0x%llx out of range, EOFS 0x%llx",
-			  __func__, cmap.bm_bn, eofs);
+			  __func__, map->bm_bn, eofs);
 		WARN_ON(1);
 		return -EFSCORRUPTED;
 	}
-
-	pag = xfs_perag_get(btp->bt_mount,
-			    xfs_daddr_to_agno(btp->bt_mount, cmap.bm_bn));
-
-	spin_lock(&pag->pag_buf_lock);
-	bp = rhashtable_lookup_fast(&pag->pag_buf_hash, &cmap,
-				    xfs_buf_hash_params);
-	if (bp) {
-		atomic_inc(&bp->b_hold);
-		goto found;
-	}
-
-	/* No match found */
-	if (!new_bp) {
-		XFS_STATS_INC(btp->bt_mount, xb_miss_locked);
-		spin_unlock(&pag->pag_buf_lock);
-		xfs_perag_put(pag);
-		return -ENOENT;
-	}
-
-	/* the buffer keeps the perag reference until it is freed */
-	new_bp->b_pag = pag;
-	rhashtable_insert_fast(&pag->pag_buf_hash, &new_bp->b_rhash_head,
-			       xfs_buf_hash_params);
-	spin_unlock(&pag->pag_buf_lock);
-	*found_bp = new_bp;
 	return 0;
+}
 
-found:
-	spin_unlock(&pag->pag_buf_lock);
-	xfs_perag_put(pag);
-
-	if (!xfs_buf_trylock(bp)) {
-		if (flags & XBF_TRYLOCK) {
-			xfs_buf_rele(bp);
-			XFS_STATS_INC(btp->bt_mount, xb_busy_locked);
+static int
+xfs_buf_find_lock(
+	struct xfs_buf          *bp,
+	xfs_buf_flags_t		flags)
+{
+	if (flags & XBF_TRYLOCK) {
+		if (!xfs_buf_trylock(bp)) {
+			XFS_STATS_INC(bp->b_mount, xb_busy_locked);
 			return -EAGAIN;
 		}
+	} else {
 		xfs_buf_lock(bp);
-		XFS_STATS_INC(btp->bt_mount, xb_get_locked_waited);
+		XFS_STATS_INC(bp->b_mount, xb_get_locked_waited);
 	}
 
 	/*
@@ -609,57 +563,59 @@ found:
 		bp->b_flags &= _XBF_KMEM | _XBF_PAGES;
 		bp->b_ops = NULL;
 	}
-
-	trace_xfs_buf_find(bp, flags, _RET_IP_);
-	XFS_STATS_INC(btp->bt_mount, xb_get_locked);
-	*found_bp = bp;
 	return 0;
 }
 
-struct xfs_buf *
-xfs_buf_incore(
-	struct xfs_buftarg	*target,
-	xfs_daddr_t		blkno,
-	size_t			numblks,
-	xfs_buf_flags_t		flags)
+static inline int
+xfs_buf_lookup(
+	struct xfs_perag	*pag,
+	struct xfs_buf_map	*map,
+	xfs_buf_flags_t		flags,
+	struct xfs_buf		**bpp)
 {
-	struct xfs_buf		*bp;
+	struct xfs_buf          *bp;
 	int			error;
-	DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
 
-	error = xfs_buf_find(target, &map, 1, flags, NULL, &bp);
-	if (error)
-		return NULL;
-	return bp;
+	rcu_read_lock();
+	bp = rhashtable_lookup(&pag->pag_buf_hash, map, xfs_buf_hash_params);
+	if (!bp || !atomic_inc_not_zero(&bp->b_hold)) {
+		rcu_read_unlock();
+		return -ENOENT;
+	}
+	rcu_read_unlock();
+
+	error = xfs_buf_find_lock(bp, flags);
+	if (error) {
+		xfs_buf_rele(bp);
+		return error;
+	}
+
+	trace_xfs_buf_find(bp, flags, _RET_IP_);
+	*bpp = bp;
+	return 0;
 }
 
 /*
- * Assembles a buffer covering the specified range. The code is optimised for
- * cache hits, as metadata intensive workloads will see 3 orders of magnitude
- * more hits than misses.
+ * Insert the new_bp into the hash table. This consumes the perag reference
+ * taken for the lookup regardless of the result of the insert.
  */
-int
-xfs_buf_get_map(
-	struct xfs_buftarg	*target,
+static int
+xfs_buf_find_insert(
+	struct xfs_buftarg	*btp,
+	struct xfs_perag	*pag,
+	struct xfs_buf_map	*cmap,
 	struct xfs_buf_map	*map,
 	int			nmaps,
 	xfs_buf_flags_t		flags,
 	struct xfs_buf		**bpp)
 {
-	struct xfs_buf		*bp;
 	struct xfs_buf		*new_bp;
+	struct xfs_buf		*bp;
 	int			error;
 
-	*bpp = NULL;
-	error = xfs_buf_find(target, map, nmaps, flags, NULL, &bp);
-	if (!error)
-		goto found;
-	if (error != -ENOENT)
-		return error;
-
-	error = _xfs_buf_alloc(target, map, nmaps, flags, &new_bp);
+	error = _xfs_buf_alloc(btp, map, nmaps, flags, &new_bp);
 	if (error)
-		return error;
+		goto out_drop_pag;
 
 	/*
 	 * For buffers that fit entirely within a single page, first attempt to
@@ -674,18 +630,94 @@ xfs_buf_get_map(
 			goto out_free_buf;
 	}
 
-	error = xfs_buf_find(target, map, nmaps, flags, new_bp, &bp);
-	if (error)
+	spin_lock(&pag->pag_buf_lock);
+	bp = rhashtable_lookup_get_insert_fast(&pag->pag_buf_hash,
+			&new_bp->b_rhash_head, xfs_buf_hash_params);
+	if (IS_ERR(bp)) {
+		error = PTR_ERR(bp);
+		spin_unlock(&pag->pag_buf_lock);
 		goto out_free_buf;
+	}
+	if (bp) {
+		/* found an existing buffer */
+		atomic_inc(&bp->b_hold);
+		spin_unlock(&pag->pag_buf_lock);
+		error = xfs_buf_find_lock(bp, flags);
+		if (error)
+			xfs_buf_rele(bp);
+		else
+			*bpp = bp;
+		goto out_free_buf;
+	}
 
-	if (bp != new_bp)
-		xfs_buf_free(new_bp);
+	/* The new buffer keeps the perag reference until it is freed. */
+	new_bp->b_pag = pag;
+	spin_unlock(&pag->pag_buf_lock);
+	*bpp = new_bp;
+	return 0;
 
-found:
+out_free_buf:
+	xfs_buf_free(new_bp);
+out_drop_pag:
+	xfs_perag_put(pag);
+	return error;
+}
+
+/*
+ * Assembles a buffer covering the specified range. The code is optimised for
+ * cache hits, as metadata intensive workloads will see 3 orders of magnitude
+ * more hits than misses.
+ */
+int
+xfs_buf_get_map(
+	struct xfs_buftarg	*btp,
+	struct xfs_buf_map	*map,
+	int			nmaps,
+	xfs_buf_flags_t		flags,
+	struct xfs_buf		**bpp)
+{
+	struct xfs_perag	*pag;
+	struct xfs_buf		*bp = NULL;
+	struct xfs_buf_map	cmap = { .bm_bn = map[0].bm_bn };
+	int			error;
+	int			i;
+
+	for (i = 0; i < nmaps; i++)
+		cmap.bm_len += map[i].bm_len;
+
+	error = xfs_buf_map_verify(btp, &cmap);
+	if (error)
+		return error;
+
+	pag = xfs_perag_get(btp->bt_mount,
+			    xfs_daddr_to_agno(btp->bt_mount, cmap.bm_bn));
+
+	error = xfs_buf_lookup(pag, &cmap, flags, &bp);
+	if (error && error != -ENOENT)
+		goto out_put_perag;
+
+	/* cache hits always outnumber misses by at least 10:1 */
+	if (unlikely(!bp)) {
+		XFS_STATS_INC(btp->bt_mount, xb_miss_locked);
+
+		if (flags & XBF_INCORE)
+			goto out_put_perag;
+
+		/* xfs_buf_find_insert() consumes the perag reference. */
+		error = xfs_buf_find_insert(btp, pag, &cmap, map, nmaps,
+				flags, &bp);
+		if (error)
+			return error;
+	} else {
+		XFS_STATS_INC(btp->bt_mount, xb_get_locked);
+		xfs_perag_put(pag);
+	}
+
+	/* We do not hold a perag reference anymore. */
 	if (!bp->b_addr) {
 		error = _xfs_buf_map_pages(bp, flags);
 		if (unlikely(error)) {
-			xfs_warn_ratelimited(target->bt_mount,
+			xfs_warn_ratelimited(btp->bt_mount,
 				"%s: failed to map %u pages", __func__,
 				bp->b_page_count);
 			xfs_buf_relse(bp);
@@ -700,12 +732,13 @@ found:
 	if (!(flags & XBF_READ))
 		xfs_buf_ioerror(bp, 0);
 
-	XFS_STATS_INC(target->bt_mount, xb_get);
+	XFS_STATS_INC(btp->bt_mount, xb_get);
 	trace_xfs_buf_get(bp, flags, _RET_IP_);
 	*bpp = bp;
 	return 0;
-out_free_buf:
-	xfs_buf_free(new_bp);
+
+out_put_perag:
+	xfs_perag_put(pag);
 	return error;
 }
 
@@ -2275,29 +2308,6 @@ xfs_buf_delwri_pushbuf(
 	return error;
 }
 
-int __init
-xfs_buf_init(void)
-{
-	xfs_buf_cache = kmem_cache_create("xfs_buf", sizeof(struct xfs_buf), 0,
-					 SLAB_HWCACHE_ALIGN |
-					 SLAB_RECLAIM_ACCOUNT |
-					 SLAB_MEM_SPREAD,
-					 NULL);
-	if (!xfs_buf_cache)
-		goto out;
-
-	return 0;
-
- out:
-	return -ENOMEM;
-}
-
-void
-xfs_buf_terminate(void)
-{
-	kmem_cache_destroy(xfs_buf_cache);
-}
-
 void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref)
 {
 	/*
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 1ee3056ff9cf..549c60942208 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -15,6 +15,8 @@
 #include <linux/uio.h>
 #include <linux/list_lru.h>
 
+extern struct kmem_cache *xfs_buf_cache;
+
 /*
  *	Base types
  */
@@ -42,9 +44,11 @@ struct xfs_buf;
 #define _XBF_DELWRI_Q	 (1u << 22)/* buffer on a delwri queue */
 
 /* flags used only as arguments to access routines */
+#define XBF_INCORE	 (1u << 29)/* lookup only, return if found in cache */
 #define XBF_TRYLOCK	 (1u << 30)/* lock requested, but do not wait */
 #define XBF_UNMAPPED	 (1u << 31)/* do not map the buffer */
 
+
 typedef unsigned int xfs_buf_flags_t;
 
 #define XFS_BUF_FLAGS \
@@ -63,6 +67,7 @@ typedef unsigned int xfs_buf_flags_t;
 	{ _XBF_KMEM,		"KMEM" }, \
 	{ _XBF_DELWRI_Q,	"DELWRI_Q" }, \
 	/* The following interface flags should never be set */ \
+	{ XBF_INCORE,		"INCORE" }, \
 	{ XBF_TRYLOCK,		"TRYLOCK" }, \
 	{ XBF_UNMAPPED,		"UNMAPPED" }
 
@@ -193,13 +198,10 @@ struct xfs_buf {
 	int			b_last_error;
 
 	const struct xfs_buf_ops	*b_ops;
+	struct rcu_head		b_rcu;
 };
 
 /* Finding and Reading Buffers */
-struct xfs_buf *xfs_buf_incore(struct xfs_buftarg *target,
-			   xfs_daddr_t blkno, size_t numblks,
-			   xfs_buf_flags_t flags);
-
 int xfs_buf_get_map(struct xfs_buftarg *target, struct xfs_buf_map *map,
 		int nmaps, xfs_buf_flags_t flags, struct xfs_buf **bpp);
 int xfs_buf_read_map(struct xfs_buftarg *target, struct xfs_buf_map *map,
@@ -209,6 +211,19 @@ void xfs_buf_readahead_map(struct xfs_buftarg *target,
 			       struct xfs_buf_map *map, int nmaps,
 			       const struct xfs_buf_ops *ops);
 
+static inline int
+xfs_buf_incore(
+	struct xfs_buftarg	*target,
+	xfs_daddr_t		blkno,
+	size_t			numblks,
+	xfs_buf_flags_t		flags,
+	struct xfs_buf		**bpp)
+{
+	DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
+
+	return xfs_buf_get_map(target, &map, 1, XBF_INCORE | flags, bpp);
+}
+
 static inline int
 xfs_buf_get(
 	struct xfs_buftarg	*target,
@@ -294,10 +309,6 @@ extern int xfs_buf_delwri_submit(struct list_head *);
 extern int xfs_buf_delwri_submit_nowait(struct list_head *);
 extern int xfs_buf_delwri_pushbuf(struct xfs_buf *, struct list_head *);
 
-/* Buffer Daemon Setup Routines */
-extern int xfs_buf_init(void);
-extern void xfs_buf_terminate(void);
-
 static inline xfs_daddr_t xfs_buf_daddr(struct xfs_buf *bp)
 {
 	return bp->b_maps[0].bm_bn;
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index a7174a5b3203..e295fc8062d8 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -248,7 +248,7 @@ xfs_dir2_leaf_readbuf(
 	struct xfs_inode	*dp = args->dp;
 	struct xfs_buf		*bp = NULL;
 	struct xfs_da_geometry	*geo = args->geo;
-	struct xfs_ifork	*ifp = XFS_IFORK_PTR(dp, XFS_DATA_FORK);
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(dp, XFS_DATA_FORK);
 	struct xfs_bmbt_irec	map;
 	struct blk_plug		plug;
 	xfs_dir2_off_t		new_off;
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index c6fe3f6ebb6b..bfc829c07f03 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -45,7 +45,7 @@ xfs_trim_extents(
 	 */
 	xfs_log_force(mp, XFS_LOG_SYNC);
 
-	error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
+	error = xfs_alloc_read_agf(pag, NULL, 0, &agbp);
 	if (error)
 		goto out_put_perag;
 	agf = agbp->b_addr;
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 5a6c3c3c4de2..8fb90da89787 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -549,7 +549,7 @@ xfs_dquot_check_type(
 	 * at the same time.  The non-user quota file can be switched between
 	 * group and project quota uses depending on the mount options, which
 	 * means that we can encounter the other type when we try to load quota
-	 * defaults.  Quotacheck will soon reset the the entire quota file
+	 * defaults.  Quotacheck will soon reset the entire quota file
 	 * (including the root dquot) anyway, but don't log scary corruption
 	 * reports to dmesg.
 	 */
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 765be054dffe..27ccfcd82f04 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -11,6 +11,7 @@
 #include "xfs_bit.h"
 #include "xfs_shared.h"
 #include "xfs_mount.h"
+#include "xfs_ag.h"
 #include "xfs_defer.h"
 #include "xfs_trans.h"
 #include "xfs_trans_priv.h"
@@ -187,12 +188,12 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
 {
 	xfs_efi_log_format_t *src_efi_fmt = buf->i_addr;
 	uint i;
-	uint len = sizeof(xfs_efi_log_format_t) + 
-		(src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_t);  
-	uint len32 = sizeof(xfs_efi_log_format_32_t) + 
-		(src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_32_t);  
-	uint len64 = sizeof(xfs_efi_log_format_64_t) + 
-		(src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_64_t);  
+	uint len = sizeof(xfs_efi_log_format_t) +
+		(src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_t);
+	uint len32 = sizeof(xfs_efi_log_format_32_t) +
+		(src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_32_t);
+	uint len64 = sizeof(xfs_efi_log_format_64_t) +
+		(src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_64_t);
 
 	if (buf->i_len == len) {
 		memcpy((char *)dst_efi_fmt, (char*)src_efi_fmt, len);
@@ -551,6 +552,7 @@ xfs_agfl_free_finish_item(
 	xfs_agnumber_t			agno;
 	xfs_agblock_t			agbno;
 	uint				next_extent;
+	struct xfs_perag		*pag;
 
 	free = container_of(item, struct xfs_extent_free_item, xefi_list);
 	ASSERT(free->xefi_blockcount == 1);
@@ -560,9 +562,11 @@ xfs_agfl_free_finish_item(
 
 	trace_xfs_agfl_free_deferred(mp, agno, 0, agbno, free->xefi_blockcount);
 
-	error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp);
+	pag = xfs_perag_get(mp, agno);
+	error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
 	if (!error)
 		error = xfs_free_agfl_block(tp, agno, agbno, agbp, &oinfo);
+	xfs_perag_put(pag);
 
 	/*
 	 * Mark the transaction dirty, even on error. This ensures the
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index be9bcf8a1f99..34b21a29c39b 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -126,7 +126,7 @@ xfs_filestream_pick_ag(
 		pag = xfs_perag_get(mp, ag);
 
 		if (!pag->pagf_init) {
-			err = xfs_alloc_pagf_init(mp, NULL, ag, trylock);
+			err = xfs_alloc_read_agf(pag, NULL, trylock, NULL);
 			if (err) {
 				if (err != -EAGAIN) {
 					xfs_perag_put(pag);
@@ -181,7 +181,7 @@ next_ag:
 		if (ag != startag)
 			continue;
 
-		/* Allow sleeping in xfs_alloc_pagf_init() on the 2nd pass. */
+		/* Allow sleeping in xfs_alloc_read_agf() on the 2nd pass. */
 		if (trylock != 0) {
 			trylock = 0;
 			continue;
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index bb23199f65c3..d8337274c74d 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -642,8 +642,7 @@ __xfs_getfsmap_datadev(
 			info->agf_bp = NULL;
 		}
 
-		error = xfs_alloc_read_agf(mp, tp, pag->pag_agno, 0,
-				&info->agf_bp);
+		error = xfs_alloc_read_agf(pag, tp, 0, &info->agf_bp);
 		if (error)
 			break;
 
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index d4a77c53f94b..5fe9af24dfcd 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -41,6 +41,7 @@ xfs_resizefs_init_new_ags(
 	xfs_agnumber_t		oagcount,
 	xfs_agnumber_t		nagcount,
 	xfs_rfsblock_t		delta,
+	struct xfs_perag	*last_pag,
 	bool			*lastag_extended)
 {
 	struct xfs_mount	*mp = tp->t_mountp;
@@ -73,7 +74,7 @@ xfs_resizefs_init_new_ags(
 
 	if (delta) {
 		*lastag_extended = true;
-		error = xfs_ag_extend_space(mp, tp, id, delta);
+		error = xfs_ag_extend_space(last_pag, tp, delta);
 	}
 	return error;
 }
@@ -96,6 +97,7 @@ xfs_growfs_data_private(
 	xfs_agnumber_t		oagcount;
 	struct xfs_trans	*tp;
 	struct aghdr_init_data	id = {};
+	struct xfs_perag	*last_pag;
 
 	nb = in->newblocks;
 	error = xfs_sb_validate_fsb_count(&mp->m_sb, nb);
@@ -128,10 +130,9 @@ xfs_growfs_data_private(
 		return -EINVAL;
 
 	oagcount = mp->m_sb.sb_agcount;
-
 	/* allocate the new per-ag structures */
 	if (nagcount > oagcount) {
-		error = xfs_initialize_perag(mp, nagcount, &nagimax);
+		error = xfs_initialize_perag(mp, nagcount, nb, &nagimax);
 		if (error)
 			return error;
 	} else if (nagcount < oagcount) {
@@ -145,15 +146,17 @@ xfs_growfs_data_private(
 	if (error)
 		return error;
 
+	last_pag = xfs_perag_get(mp, oagcount - 1);
 	if (delta > 0) {
 		error = xfs_resizefs_init_new_ags(tp, &id, oagcount, nagcount,
-						  delta, &lastag_extended);
+				delta, last_pag, &lastag_extended);
 	} else {
 		xfs_warn_mount(mp, XFS_OPSTATE_WARNED_SHRINK,
 	"EXPERIMENTAL online shrink feature in use. Use at your own risk!");
 
-		error = xfs_ag_shrink_space(mp, &tp, nagcount - 1, -delta);
+		error = xfs_ag_shrink_space(last_pag, &tp, -delta);
 	}
+	xfs_perag_put(last_pag);
 	if (error)
 		goto out_trans_cancel;
 
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 2609825d53ee..e3b2304bb4d2 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -98,8 +98,9 @@ xfs_inode_alloc(
 	ip->i_ino = ino;
 	ip->i_mount = mp;
 	memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
-	ip->i_afp = NULL;
 	ip->i_cowfp = NULL;
+	memset(&ip->i_af, 0, sizeof(ip->i_af));
+	ip->i_af.if_format = XFS_DINODE_FMT_EXTENTS;
 	memset(&ip->i_df, 0, sizeof(ip->i_df));
 	ip->i_flags = 0;
 	ip->i_delayed_blks = 0;
@@ -111,6 +112,8 @@ xfs_inode_alloc(
 	INIT_WORK(&ip->i_ioend_work, xfs_end_io);
 	INIT_LIST_HEAD(&ip->i_ioend_list);
 	spin_lock_init(&ip->i_ioend_lock);
+	ip->i_next_unlinked = NULLAGINO;
+	ip->i_prev_unlinked = NULLAGINO;
 
 	return ip;
 }
@@ -130,10 +133,8 @@ xfs_inode_free_callback(
 		break;
 	}
 
-	if (ip->i_afp) {
-		xfs_idestroy_fork(ip->i_afp);
-		kmem_cache_free(xfs_ifork_cache, ip->i_afp);
-	}
+	xfs_ifork_zap_attr(ip);
+
 	if (ip->i_cowfp) {
 		xfs_idestroy_fork(ip->i_cowfp);
 		kmem_cache_free(xfs_ifork_cache, ip->i_cowfp);
@@ -912,6 +913,7 @@ reclaim:
 	ip->i_checked = 0;
 	spin_unlock(&ip->i_flags_lock);
 
+	ASSERT(!ip->i_itemp || ip->i_itemp->ili_item.li_buf == NULL);
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 
 	XFS_STATS_INC(ip->i_mount, xs_ig_reclaims);
@@ -1774,7 +1776,7 @@ xfs_check_delalloc(
 	struct xfs_inode	*ip,
 	int			whichfork)
 {
-	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
 	struct xfs_bmbt_irec	got;
 	struct xfs_iext_cursor	icur;
 
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 3e1c62ffa4f7..3022918bf96a 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -20,6 +20,7 @@
 #include "xfs_trans.h"
 #include "xfs_buf_item.h"
 #include "xfs_inode_item.h"
+#include "xfs_iunlink_item.h"
 #include "xfs_ialloc.h"
 #include "xfs_bmap.h"
 #include "xfs_bmap_util.h"
@@ -125,7 +126,7 @@ xfs_ilock_attr_map_shared(
 {
 	uint			lock_mode = XFS_ILOCK_SHARED;
 
-	if (ip->i_afp && xfs_need_iread_extents(ip->i_afp))
+	if (xfs_inode_has_attr_fork(ip) && xfs_need_iread_extents(&ip->i_af))
 		lock_mode = XFS_ILOCK_EXCL;
 	xfs_ilock(ip, lock_mode);
 	return lock_mode;
@@ -635,7 +636,7 @@ xfs_ip2xflags(
 			flags |= FS_XFLAG_COWEXTSIZE;
 	}
 
-	if (XFS_IFORK_Q(ip))
+	if (xfs_inode_has_attr_fork(ip))
 		flags |= FS_XFLAG_HASATTR;
 	return flags;
 }
@@ -893,7 +894,7 @@ xfs_init_new_inode(
 	 */
 	if (init_xattrs && xfs_has_attr(mp)) {
 		ip->i_forkoff = xfs_default_attroffset(ip) >> 3;
-		ip->i_afp = xfs_ifork_alloc(XFS_DINODE_FMT_EXTENTS, 0);
+		xfs_ifork_init_attr(ip, XFS_DINODE_FMT_EXTENTS, 0);
 	}
 
 	/*
@@ -1293,8 +1294,8 @@ xfs_itruncate_clear_reflink_flags(
 
 	if (!xfs_is_reflink_inode(ip))
 		return;
-	dfork = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
-	cfork = XFS_IFORK_PTR(ip, XFS_COW_FORK);
+	dfork = xfs_ifork_ptr(ip, XFS_DATA_FORK);
+	cfork = xfs_ifork_ptr(ip, XFS_COW_FORK);
 	if (dfork->if_bytes == 0 && cfork->if_bytes == 0)
 		ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
 	if (cfork->if_bytes == 0)
@@ -1643,7 +1644,7 @@ xfs_inode_needs_inactive(
 	struct xfs_inode	*ip)
 {
 	struct xfs_mount	*mp = ip->i_mount;
-	struct xfs_ifork	*cow_ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
+	struct xfs_ifork	*cow_ifp = xfs_ifork_ptr(ip, XFS_COW_FORK);
 
 	/*
 	 * If the inode is already free, then there can be nothing
@@ -1762,13 +1763,12 @@ xfs_inactive(
 	 * now.  The code calls a routine that recursively deconstructs the
 	 * attribute fork. If also blows away the in-core attribute fork.
 	 */
-	if (XFS_IFORK_Q(ip)) {
+	if (xfs_inode_has_attr_fork(ip)) {
 		error = xfs_attr_inactive(ip);
 		if (error)
 			goto out;
 	}
 
-	ASSERT(!ip->i_afp);
 	ASSERT(ip->i_forkoff == 0);
 
 	/*
@@ -1801,195 +1801,69 @@ out:
  * because we must walk that list to find the inode that points to the inode
  * being removed from the unlinked hash bucket list.
  *
- * What if we modelled the unlinked list as a collection of records capturing
- * "X.next_unlinked = Y" relations?  If we indexed those records on Y, we'd
- * have a fast way to look up unlinked list predecessors, which avoids the
- * slow list walk.  That's exactly what we do here (in-core) with a per-AG
- * rhashtable.
+ * Hence we keep an in-memory double linked list to link each inode on an
+ * unlinked list. Because there are 64 unlinked lists per AGI, keeping pointer
+ * based lists would require having 64 list heads in the perag, one for each
+ * list. This is expensive in terms of memory (think millions of AGs) and cache
+ * misses on lookups. Instead, use the fact that inodes on the unlinked list
+ * must be referenced at the VFS level to keep them on the list and hence we
+ * have an existence guarantee for inodes on the unlinked list.
  *
- * Because this is a backref cache, we ignore operational failures since the
- * iunlink code can fall back to the slow bucket walk.  The only errors that
- * should bubble out are for obviously incorrect situations.
- *
- * All users of the backref cache MUST hold the AGI buffer lock to serialize
- * access or have otherwise provided for concurrency control.
+ * Given we have an existence guarantee, we can use lockless inode cache lookups
+ * to resolve aginos to xfs inodes. This means we only need 8 bytes per inode
+ * for the double linked unlinked list, and we don't need any extra locking to
+ * keep the list safe as all manipulations are done under the AGI buffer lock.
+ * Keeping the list up to date does not require memory allocation, just finding
+ * the XFS inode and updating the next/prev unlinked list aginos.
  */
 
-/* Capture a "X.next_unlinked = Y" relationship. */
-struct xfs_iunlink {
-	struct rhash_head	iu_rhash_head;
-	xfs_agino_t		iu_agino;		/* X */
-	xfs_agino_t		iu_next_unlinked;	/* Y */
-};
-
-/* Unlinked list predecessor lookup hashtable construction */
-static int
-xfs_iunlink_obj_cmpfn(
-	struct rhashtable_compare_arg	*arg,
-	const void			*obj)
-{
-	const xfs_agino_t		*key = arg->key;
-	const struct xfs_iunlink	*iu = obj;
-
-	if (iu->iu_next_unlinked != *key)
-		return 1;
-	return 0;
-}
-
-static const struct rhashtable_params xfs_iunlink_hash_params = {
-	.min_size		= XFS_AGI_UNLINKED_BUCKETS,
-	.key_len		= sizeof(xfs_agino_t),
-	.key_offset		= offsetof(struct xfs_iunlink,
-					   iu_next_unlinked),
-	.head_offset		= offsetof(struct xfs_iunlink, iu_rhash_head),
-	.automatic_shrinking	= true,
-	.obj_cmpfn		= xfs_iunlink_obj_cmpfn,
-};
-
 /*
- * Return X, where X.next_unlinked == @agino.  Returns NULLAGINO if no such
- * relation is found.
+ * Find an inode on the unlinked list. This does not take references to the
+ * inode as we have existence guarantees by holding the AGI buffer lock and that
+ * only unlinked, referenced inodes can be on the unlinked inode list.  If we
+ * don't find the inode in cache, then let the caller handle the situation.
  */
-static xfs_agino_t
-xfs_iunlink_lookup_backref(
+static struct xfs_inode *
+xfs_iunlink_lookup(
 	struct xfs_perag	*pag,
 	xfs_agino_t		agino)
 {
-	struct xfs_iunlink	*iu;
+	struct xfs_inode	*ip;
 
-	iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino,
-			xfs_iunlink_hash_params);
-	return iu ? iu->iu_agino : NULLAGINO;
-}
+	rcu_read_lock();
+	ip = radix_tree_lookup(&pag->pag_ici_root, agino);
 
-/*
- * Take ownership of an iunlink cache entry and insert it into the hash table.
- * If successful, the entry will be owned by the cache; if not, it is freed.
- * Either way, the caller does not own @iu after this call.
- */
-static int
-xfs_iunlink_insert_backref(
-	struct xfs_perag	*pag,
-	struct xfs_iunlink	*iu)
-{
-	int			error;
-
-	error = rhashtable_insert_fast(&pag->pagi_unlinked_hash,
-			&iu->iu_rhash_head, xfs_iunlink_hash_params);
 	/*
-	 * Fail loudly if there already was an entry because that's a sign of
-	 * corruption of in-memory data.  Also fail loudly if we see an error
-	 * code we didn't anticipate from the rhashtable code.  Currently we
-	 * only anticipate ENOMEM.
+	 * Inode not in memory or in RCU freeing limbo should not happen.
+	 * Warn about this and let the caller handle the failure.
 	 */
-	if (error) {
-		WARN(error != -ENOMEM, "iunlink cache insert error %d", error);
-		kmem_free(iu);
+	if (WARN_ON_ONCE(!ip || !ip->i_ino)) {
+		rcu_read_unlock();
+		return NULL;
 	}
-	/*
-	 * Absorb any runtime errors that aren't a result of corruption because
-	 * this is a cache and we can always fall back to bucket list scanning.
-	 */
-	if (error != 0 && error != -EEXIST)
-		error = 0;
-	return error;
+	ASSERT(!xfs_iflags_test(ip, XFS_IRECLAIMABLE | XFS_IRECLAIM));
+	rcu_read_unlock();
+	return ip;
 }
 
-/* Remember that @prev_agino.next_unlinked = @this_agino. */
+/* Update the prev pointer of the next agino. */
 static int
-xfs_iunlink_add_backref(
+xfs_iunlink_update_backref(
 	struct xfs_perag	*pag,
 	xfs_agino_t		prev_agino,
-	xfs_agino_t		this_agino)
+	xfs_agino_t		next_agino)
 {
-	struct xfs_iunlink	*iu;
+	struct xfs_inode	*ip;
 
-	if (XFS_TEST_ERROR(false, pag->pag_mount, XFS_ERRTAG_IUNLINK_FALLBACK))
+	/* No update necessary if we are at the end of the list. */
+	if (next_agino == NULLAGINO)
 		return 0;
 
-	iu = kmem_zalloc(sizeof(*iu), KM_NOFS);
-	iu->iu_agino = prev_agino;
-	iu->iu_next_unlinked = this_agino;
-
-	return xfs_iunlink_insert_backref(pag, iu);
-}
-
-/*
- * Replace X.next_unlinked = @agino with X.next_unlinked = @next_unlinked.
- * If @next_unlinked is NULLAGINO, we drop the backref and exit.  If there
- * wasn't any such entry then we don't bother.
- */
-static int
-xfs_iunlink_change_backref(
-	struct xfs_perag	*pag,
-	xfs_agino_t		agino,
-	xfs_agino_t		next_unlinked)
-{
-	struct xfs_iunlink	*iu;
-	int			error;
-
-	/* Look up the old entry; if there wasn't one then exit. */
-	iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino,
-			xfs_iunlink_hash_params);
-	if (!iu)
-		return 0;
-
-	/*
-	 * Remove the entry.  This shouldn't ever return an error, but if we
-	 * couldn't remove the old entry we don't want to add it again to the
-	 * hash table, and if the entry disappeared on us then someone's
-	 * violated the locking rules and we need to fail loudly.  Either way
-	 * we cannot remove the inode because internal state is or would have
-	 * been corrupt.
-	 */
-	error = rhashtable_remove_fast(&pag->pagi_unlinked_hash,
-			&iu->iu_rhash_head, xfs_iunlink_hash_params);
-	if (error)
-		return error;
-
-	/* If there is no new next entry just free our item and return. */
-	if (next_unlinked == NULLAGINO) {
-		kmem_free(iu);
-		return 0;
-	}
-
-	/* Update the entry and re-add it to the hash table. */
-	iu->iu_next_unlinked = next_unlinked;
-	return xfs_iunlink_insert_backref(pag, iu);
-}
-
-/* Set up the in-core predecessor structures. */
-int
-xfs_iunlink_init(
-	struct xfs_perag	*pag)
-{
-	return rhashtable_init(&pag->pagi_unlinked_hash,
-			&xfs_iunlink_hash_params);
-}
-
-/* Free the in-core predecessor structures. */
-static void
-xfs_iunlink_free_item(
-	void			*ptr,
-	void			*arg)
-{
-	struct xfs_iunlink	*iu = ptr;
-	bool			*freed_anything = arg;
-
-	*freed_anything = true;
-	kmem_free(iu);
-}
-
-void
-xfs_iunlink_destroy(
-	struct xfs_perag	*pag)
-{
-	bool			freed_anything = false;
-
-	rhashtable_free_and_destroy(&pag->pagi_unlinked_hash,
-			xfs_iunlink_free_item, &freed_anything);
-
-	ASSERT(freed_anything == false || xfs_is_shutdown(pag->pag_mount));
+	ip = xfs_iunlink_lookup(pag, next_agino);
+	if (!ip)
+		return -EFSCORRUPTED;
+	ip->i_prev_unlinked = prev_agino;
+	return 0;
 }
 
 /*
@@ -2008,7 +1882,7 @@ xfs_iunlink_update_bucket(
 	xfs_agino_t		old_value;
 	int			offset;
 
-	ASSERT(xfs_verify_agino_or_null(tp->t_mountp, pag->pag_agno, new_agino));
+	ASSERT(xfs_verify_agino_or_null(pag, new_agino));
 
 	old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]);
 	trace_xfs_iunlink_update_bucket(tp->t_mountp, pag->pag_agno, bucket_index,
@@ -2031,88 +1905,53 @@ xfs_iunlink_update_bucket(
 	return 0;
 }
 
-/* Set an on-disk inode's next_unlinked pointer. */
-STATIC void
-xfs_iunlink_update_dinode(
+static int
+xfs_iunlink_insert_inode(
 	struct xfs_trans	*tp,
 	struct xfs_perag	*pag,
-	xfs_agino_t		agino,
-	struct xfs_buf		*ibp,
-	struct xfs_dinode	*dip,
-	struct xfs_imap		*imap,
-	xfs_agino_t		next_agino)
+	struct xfs_buf		*agibp,
+	struct xfs_inode	*ip)
 {
 	struct xfs_mount	*mp = tp->t_mountp;
-	int			offset;
-
-	ASSERT(xfs_verify_agino_or_null(mp, pag->pag_agno, next_agino));
-
-	trace_xfs_iunlink_update_dinode(mp, pag->pag_agno, agino,
-			be32_to_cpu(dip->di_next_unlinked), next_agino);
-
-	dip->di_next_unlinked = cpu_to_be32(next_agino);
-	offset = imap->im_boffset +
-			offsetof(struct xfs_dinode, di_next_unlinked);
-
-	/* need to recalc the inode CRC if appropriate */
-	xfs_dinode_calc_crc(mp, dip);
-	xfs_trans_inode_buf(tp, ibp);
-	xfs_trans_log_buf(tp, ibp, offset, offset + sizeof(xfs_agino_t) - 1);
-}
-
-/* Set an in-core inode's unlinked pointer and return the old value. */
-STATIC int
-xfs_iunlink_update_inode(
-	struct xfs_trans	*tp,
-	struct xfs_inode	*ip,
-	struct xfs_perag	*pag,
-	xfs_agino_t		next_agino,
-	xfs_agino_t		*old_next_agino)
-{
-	struct xfs_mount	*mp = tp->t_mountp;
-	struct xfs_dinode	*dip;
-	struct xfs_buf		*ibp;
-	xfs_agino_t		old_value;
+	struct xfs_agi		*agi = agibp->b_addr;
+	xfs_agino_t		next_agino;
+	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
+	short			bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
 	int			error;
 
-	ASSERT(xfs_verify_agino_or_null(mp, pag->pag_agno, next_agino));
-
-	error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &ibp);
-	if (error)
-		return error;
-	dip = xfs_buf_offset(ibp, ip->i_imap.im_boffset);
-
-	/* Make sure the old pointer isn't garbage. */
-	old_value = be32_to_cpu(dip->di_next_unlinked);
-	if (!xfs_verify_agino_or_null(mp, pag->pag_agno, old_value)) {
-		xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip,
-				sizeof(*dip), __this_address);
-		error = -EFSCORRUPTED;
-		goto out;
+	/*
+	 * Get the index into the agi hash table for the list this inode will
+	 * go on.  Make sure the pointer isn't garbage and that this inode
+	 * isn't already on the list.
+	 */
+	next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
+	if (next_agino == agino ||
+	    !xfs_verify_agino_or_null(pag, next_agino)) {
+		xfs_buf_mark_corrupt(agibp);
+		return -EFSCORRUPTED;
 	}
 
 	/*
-	 * Since we're updating a linked list, we should never find that the
-	 * current pointer is the same as the new value, unless we're
-	 * terminating the list.
+	 * Update the prev pointer in the next inode to point back to this
+	 * inode.
 	 */
-	*old_next_agino = old_value;
-	if (old_value == next_agino) {
-		if (next_agino != NULLAGINO) {
-			xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__,
-					dip, sizeof(*dip), __this_address);
-			error = -EFSCORRUPTED;
-		}
-		goto out;
+	error = xfs_iunlink_update_backref(pag, agino, next_agino);
+	if (error)
+		return error;
+
+	if (next_agino != NULLAGINO) {
+		/*
+		 * There is already another inode in the bucket, so point this
+		 * inode to the current head of the list.
+		 */
+		error = xfs_iunlink_log_inode(tp, ip, pag, next_agino);
+		if (error)
+			return error;
+		ip->i_next_unlinked = next_agino;
 	}
 
-	/* Ok, update the new pointer. */
-	xfs_iunlink_update_dinode(tp, pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
-			ibp, dip, &ip->i_imap, next_agino);
-	return 0;
-out:
-	xfs_trans_brelse(tp, ibp);
-	return error;
+	/* Point the head of the list to point to this inode. */
+	return xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, agino);
 }
 
 /*
@@ -2129,11 +1968,7 @@ xfs_iunlink(
 {
 	struct xfs_mount	*mp = tp->t_mountp;
 	struct xfs_perag	*pag;
-	struct xfs_agi		*agi;
 	struct xfs_buf		*agibp;
-	xfs_agino_t		next_agino;
-	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
-	short			bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
 	int			error;
 
 	ASSERT(VFS_I(ip)->i_nlink == 0);
@@ -2143,166 +1978,80 @@ xfs_iunlink(
 	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
 
 	/* Get the agi buffer first.  It ensures lock ordering on the list. */
-	error = xfs_read_agi(mp, tp, pag->pag_agno, &agibp);
+	error = xfs_read_agi(pag, tp, &agibp);
 	if (error)
 		goto out;
-	agi = agibp->b_addr;
 
-	/*
-	 * Get the index into the agi hash table for the list this inode will
-	 * go on.  Make sure the pointer isn't garbage and that this inode
-	 * isn't already on the list.
-	 */
-	next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
-	if (next_agino == agino ||
-	    !xfs_verify_agino_or_null(mp, pag->pag_agno, next_agino)) {
-		xfs_buf_mark_corrupt(agibp);
-		error = -EFSCORRUPTED;
-		goto out;
-	}
-
-	if (next_agino != NULLAGINO) {
-		xfs_agino_t		old_agino;
-
-		/*
-		 * There is already another inode in the bucket, so point this
-		 * inode to the current head of the list.
-		 */
-		error = xfs_iunlink_update_inode(tp, ip, pag, next_agino,
-				&old_agino);
-		if (error)
-			goto out;
-		ASSERT(old_agino == NULLAGINO);
-
-		/*
-		 * agino has been unlinked, add a backref from the next inode
-		 * back to agino.
-		 */
-		error = xfs_iunlink_add_backref(pag, agino, next_agino);
-		if (error)
-			goto out;
-	}
-
-	/* Point the head of the list to point to this inode. */
-	error = xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, agino);
+	error = xfs_iunlink_insert_inode(tp, pag, agibp, ip);
 out:
 	xfs_perag_put(pag);
 	return error;
 }
 
-/* Return the imap, dinode pointer, and buffer for an inode. */
-STATIC int
-xfs_iunlink_map_ino(
-	struct xfs_trans	*tp,
-	xfs_agnumber_t		agno,
-	xfs_agino_t		agino,
-	struct xfs_imap		*imap,
-	struct xfs_dinode	**dipp,
-	struct xfs_buf		**bpp)
-{
-	struct xfs_mount	*mp = tp->t_mountp;
-	int			error;
-
-	imap->im_blkno = 0;
-	error = xfs_imap(mp, tp, XFS_AGINO_TO_INO(mp, agno, agino), imap, 0);
-	if (error) {
-		xfs_warn(mp, "%s: xfs_imap returned error %d.",
-				__func__, error);
-		return error;
-	}
-
-	error = xfs_imap_to_bp(mp, tp, imap, bpp);
-	if (error) {
-		xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.",
-				__func__, error);
-		return error;
-	}
-
-	*dipp = xfs_buf_offset(*bpp, imap->im_boffset);
-	return 0;
-}
-
-/*
- * Walk the unlinked chain from @head_agino until we find the inode that
- * points to @target_agino.  Return the inode number, map, dinode pointer,
- * and inode cluster buffer of that inode as @agino, @imap, @dipp, and @bpp.
- *
- * @tp, @pag, @head_agino, and @target_agino are input parameters.
- * @agino, @imap, @dipp, and @bpp are all output parameters.
- *
- * Do not call this function if @target_agino is the head of the list.
- */
-STATIC int
-xfs_iunlink_map_prev(
+static int
+xfs_iunlink_remove_inode(
 	struct xfs_trans	*tp,
 	struct xfs_perag	*pag,
-	xfs_agino_t		head_agino,
-	xfs_agino_t		target_agino,
-	xfs_agino_t		*agino,
-	struct xfs_imap		*imap,
-	struct xfs_dinode	**dipp,
-	struct xfs_buf		**bpp)
+	struct xfs_buf		*agibp,
+	struct xfs_inode	*ip)
 {
 	struct xfs_mount	*mp = tp->t_mountp;
-	xfs_agino_t		next_agino;
+	struct xfs_agi		*agi = agibp->b_addr;
+	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
+	xfs_agino_t		head_agino;
+	short			bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
 	int			error;
 
-	ASSERT(head_agino != target_agino);
-	*bpp = NULL;
+	trace_xfs_iunlink_remove(ip);
 
-	/* See if our backref cache can find it faster. */
-	*agino = xfs_iunlink_lookup_backref(pag, target_agino);
-	if (*agino != NULLAGINO) {
-		error = xfs_iunlink_map_ino(tp, pag->pag_agno, *agino, imap,
-				dipp, bpp);
-		if (error)
-			return error;
-
-		if (be32_to_cpu((*dipp)->di_next_unlinked) == target_agino)
-			return 0;
-
-		/*
-		 * If we get here the cache contents were corrupt, so drop the
-		 * buffer and fall back to walking the bucket list.
-		 */
-		xfs_trans_brelse(tp, *bpp);
-		*bpp = NULL;
-		WARN_ON_ONCE(1);
+	/*
+	 * Get the index into the agi hash table for the list this inode will
+	 * go on.  Make sure the head pointer isn't garbage.
+	 */
+	head_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
+	if (!xfs_verify_agino(pag, head_agino)) {
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+				agi, sizeof(*agi));
+		return -EFSCORRUPTED;
 	}
 
-	trace_xfs_iunlink_map_prev_fallback(mp, pag->pag_agno);
+	/*
+	 * Set our inode's next_unlinked pointer to NULL and then return
+	 * the old pointer value so that we can update whatever was previous
+	 * to us in the list to point to whatever was next in the list.
+	 */
+	error = xfs_iunlink_log_inode(tp, ip, pag, NULLAGINO);
+	if (error)
+		return error;
 
-	/* Otherwise, walk the entire bucket until we find it. */
-	next_agino = head_agino;
-	while (next_agino != target_agino) {
-		xfs_agino_t	unlinked_agino;
+	/*
+	 * Update the prev pointer in the next inode to point back to previous
+	 * inode in the chain.
+	 */
+	error = xfs_iunlink_update_backref(pag, ip->i_prev_unlinked,
+			ip->i_next_unlinked);
+	if (error)
+		return error;
 
-		if (*bpp)
-			xfs_trans_brelse(tp, *bpp);
+	if (head_agino != agino) {
+		struct xfs_inode	*prev_ip;
 
-		*agino = next_agino;
-		error = xfs_iunlink_map_ino(tp, pag->pag_agno, next_agino, imap,
-				dipp, bpp);
-		if (error)
-			return error;
+		prev_ip = xfs_iunlink_lookup(pag, ip->i_prev_unlinked);
+		if (!prev_ip)
+			return -EFSCORRUPTED;
 
-		unlinked_agino = be32_to_cpu((*dipp)->di_next_unlinked);
-		/*
-		 * Make sure this pointer is valid and isn't an obvious
-		 * infinite loop.
-		 */
-		if (!xfs_verify_agino(mp, pag->pag_agno, unlinked_agino) ||
-		    next_agino == unlinked_agino) {
-			XFS_CORRUPTION_ERROR(__func__,
-					XFS_ERRLEVEL_LOW, mp,
-					*dipp, sizeof(**dipp));
-			error = -EFSCORRUPTED;
-			return error;
-		}
-		next_agino = unlinked_agino;
+		error = xfs_iunlink_log_inode(tp, prev_ip, pag,
+				ip->i_next_unlinked);
+		prev_ip->i_next_unlinked = ip->i_next_unlinked;
+	} else {
+		/* Point the head of the list to the next unlinked inode. */
+		error = xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index,
+				ip->i_next_unlinked);
 	}
 
-	return 0;
+	ip->i_next_unlinked = NULLAGINO;
+	ip->i_prev_unlinked = NULLAGINO;
+	return error;
 }
 
 /*
@@ -2314,87 +2063,17 @@ xfs_iunlink_remove(
 	struct xfs_perag	*pag,
 	struct xfs_inode	*ip)
 {
-	struct xfs_mount	*mp = tp->t_mountp;
-	struct xfs_agi		*agi;
 	struct xfs_buf		*agibp;
-	struct xfs_buf		*last_ibp;
-	struct xfs_dinode	*last_dip = NULL;
-	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
-	xfs_agino_t		next_agino;
-	xfs_agino_t		head_agino;
-	short			bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
 	int			error;
 
 	trace_xfs_iunlink_remove(ip);
 
 	/* Get the agi buffer first.  It ensures lock ordering on the list. */
-	error = xfs_read_agi(mp, tp, pag->pag_agno, &agibp);
-	if (error)
-		return error;
-	agi = agibp->b_addr;
-
-	/*
-	 * Get the index into the agi hash table for the list this inode will
-	 * go on.  Make sure the head pointer isn't garbage.
-	 */
-	head_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
-	if (!xfs_verify_agino(mp, pag->pag_agno, head_agino)) {
-		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
-				agi, sizeof(*agi));
-		return -EFSCORRUPTED;
-	}
-
-	/*
-	 * Set our inode's next_unlinked pointer to NULL and then return
-	 * the old pointer value so that we can update whatever was previous
-	 * to us in the list to point to whatever was next in the list.
-	 */
-	error = xfs_iunlink_update_inode(tp, ip, pag, NULLAGINO, &next_agino);
+	error = xfs_read_agi(pag, tp, &agibp);
 	if (error)
 		return error;
 
-	/*
-	 * If there was a backref pointing from the next inode back to this
-	 * one, remove it because we've removed this inode from the list.
-	 *
-	 * Later, if this inode was in the middle of the list we'll update
-	 * this inode's backref to point from the next inode.
-	 */
-	if (next_agino != NULLAGINO) {
-		error = xfs_iunlink_change_backref(pag, next_agino, NULLAGINO);
-		if (error)
-			return error;
-	}
-
-	if (head_agino != agino) {
-		struct xfs_imap	imap;
-		xfs_agino_t	prev_agino;
-
-		/* We need to search the list for the inode being freed. */
-		error = xfs_iunlink_map_prev(tp, pag, head_agino, agino,
-				&prev_agino, &imap, &last_dip, &last_ibp);
-		if (error)
-			return error;
-
-		/* Point the previous inode on the list to the next inode. */
-		xfs_iunlink_update_dinode(tp, pag, prev_agino, last_ibp,
-				last_dip, &imap, next_agino);
-
-		/*
-		 * Now we deal with the backref for this inode.  If this inode
-		 * pointed at a real inode, change the backref that pointed to
-		 * us to point to our old next.  If this inode was the end of
-		 * the list, delete the backref that pointed to us.  Note that
-		 * change_backref takes care of deleting the backref if
-		 * next_agino is NULLAGINO.
-		 */
-		return xfs_iunlink_change_backref(agibp->b_pag, agino,
-				next_agino);
-	}
-
-	/* Point the head of the list to the next unlinked inode. */
-	return xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index,
-			next_agino);
+	return xfs_iunlink_remove_inode(tp, pag, agibp, ip);
 }
 
 /*
@@ -3032,10 +2711,12 @@ out_trans_abort:
 static int
 xfs_rename_alloc_whiteout(
 	struct user_namespace	*mnt_userns,
+	struct xfs_name		*src_name,
 	struct xfs_inode	*dp,
 	struct xfs_inode	**wip)
 {
 	struct xfs_inode	*tmpfile;
+	struct qstr		name;
 	int			error;
 
 	error = xfs_create_tmpfile(mnt_userns, dp, S_IFCHR | WHITEOUT_MODE,
@@ -3043,6 +2724,15 @@ xfs_rename_alloc_whiteout(
 	if (error)
 		return error;
 
+	name.name = src_name->name;
+	name.len = src_name->len;
+	error = xfs_inode_init_security(VFS_I(tmpfile), VFS_I(dp), &name);
+	if (error) {
+		xfs_finish_inode_setup(tmpfile);
+		xfs_irele(tmpfile);
+		return error;
+	}
+
 	/*
 	 * Prepare the tmpfile inode as if it were created through the VFS.
 	 * Complete the inode setup and flag it as linkable.  nlink is already
@@ -3093,7 +2783,8 @@ xfs_rename(
 	 * appropriately.
 	 */
 	if (flags & RENAME_WHITEOUT) {
-		error = xfs_rename_alloc_whiteout(mnt_userns, target_dp, &wip);
+		error = xfs_rename_alloc_whiteout(mnt_userns, src_name,
+						  target_dp, &wip);
 		if (error)
 			return error;
 
@@ -3229,11 +2920,13 @@ retry:
 		if (inodes[i] == wip ||
 		    (inodes[i] == target_ip &&
 		     (VFS_I(target_ip)->i_nlink == 1 || src_is_directory))) {
-			struct xfs_buf	*bp;
-			xfs_agnumber_t	agno;
+			struct xfs_perag	*pag;
+			struct xfs_buf		*bp;
 
-			agno = XFS_INO_TO_AGNO(mp, inodes[i]->i_ino);
-			error = xfs_read_agi(mp, tp, agno, &bp);
+			pag = xfs_perag_get(mp,
+					XFS_INO_TO_AGNO(mp, inodes[i]->i_ino));
+			error = xfs_read_agi(pag, tp, &bp);
+			xfs_perag_put(pag);
 			if (error)
 				goto out_trans_cancel;
 		}
@@ -3452,13 +3145,13 @@ xfs_iflush(
 			goto flush_out;
 		}
 	}
-	if (XFS_TEST_ERROR(ip->i_df.if_nextents + xfs_ifork_nextents(ip->i_afp) >
+	if (XFS_TEST_ERROR(ip->i_df.if_nextents + xfs_ifork_nextents(&ip->i_af) >
 				ip->i_nblocks, mp, XFS_ERRTAG_IFLUSH_5)) {
 		xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
 			"%s: detected corrupt incore inode %llu, "
 			"total extents = %llu nblocks = %lld, ptr "PTR_FMT,
 			__func__, ip->i_ino,
-			ip->i_df.if_nextents + xfs_ifork_nextents(ip->i_afp),
+			ip->i_df.if_nextents + xfs_ifork_nextents(&ip->i_af),
 			ip->i_nblocks, ip);
 		goto flush_out;
 	}
@@ -3488,7 +3181,8 @@ xfs_iflush(
 	if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL &&
 	    xfs_ifork_verify_local_data(ip))
 		goto flush_out;
-	if (ip->i_afp && ip->i_afp->if_format == XFS_DINODE_FMT_LOCAL &&
+	if (xfs_inode_has_attr_fork(ip) &&
+	    ip->i_af.if_format == XFS_DINODE_FMT_LOCAL &&
 	    xfs_ifork_verify_local_attr(ip))
 		goto flush_out;
 
@@ -3506,7 +3200,7 @@ xfs_iflush(
 	}
 
 	xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK);
-	if (XFS_IFORK_Q(ip))
+	if (xfs_inode_has_attr_fork(ip))
 		xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK);
 
 	/*
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 7be6f8e705ab..4d626f4321bc 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -33,9 +33,9 @@ typedef struct xfs_inode {
 	struct xfs_imap		i_imap;		/* location for xfs_imap() */
 
 	/* Extent information. */
-	struct xfs_ifork	*i_afp;		/* attribute fork pointer */
 	struct xfs_ifork	*i_cowfp;	/* copy on write extents */
 	struct xfs_ifork	i_df;		/* data fork */
+	struct xfs_ifork	i_af;		/* attribute fork */
 
 	/* Transaction and locking information. */
 	struct xfs_inode_log_item *i_itemp;	/* logging information */
@@ -68,6 +68,10 @@ typedef struct xfs_inode {
 	uint64_t		i_diflags2;	/* XFS_DIFLAG2_... */
 	struct timespec64	i_crtime;	/* time created */
 
+	/* unlinked list pointers */
+	xfs_agino_t		i_next_unlinked;
+	xfs_agino_t		i_prev_unlinked;
+
 	/* VFS inode */
 	struct inode		i_vnode;	/* embedded VFS inode */
 
@@ -77,6 +81,66 @@ typedef struct xfs_inode {
 	struct list_head	i_ioend_list;
 } xfs_inode_t;
 
+static inline bool xfs_inode_has_attr_fork(struct xfs_inode *ip)
+{
+	return ip->i_forkoff > 0;
+}
+
+static inline struct xfs_ifork *
+xfs_ifork_ptr(
+	struct xfs_inode	*ip,
+	int			whichfork)
+{
+	switch (whichfork) {
+	case XFS_DATA_FORK:
+		return &ip->i_df;
+	case XFS_ATTR_FORK:
+		if (!xfs_inode_has_attr_fork(ip))
+			return NULL;
+		return &ip->i_af;
+	case XFS_COW_FORK:
+		return ip->i_cowfp;
+	default:
+		ASSERT(0);
+		return NULL;
+	}
+}
+
+static inline unsigned int xfs_inode_fork_boff(struct xfs_inode *ip)
+{
+	return ip->i_forkoff << 3;
+}
+
+static inline unsigned int xfs_inode_data_fork_size(struct xfs_inode *ip)
+{
+	if (xfs_inode_has_attr_fork(ip))
+		return xfs_inode_fork_boff(ip);
+
+	return XFS_LITINO(ip->i_mount);
+}
+
+static inline unsigned int xfs_inode_attr_fork_size(struct xfs_inode *ip)
+{
+	if (xfs_inode_has_attr_fork(ip))
+		return XFS_LITINO(ip->i_mount) - xfs_inode_fork_boff(ip);
+	return 0;
+}
+
+static inline unsigned int
+xfs_inode_fork_size(
+	struct xfs_inode	*ip,
+	int			whichfork)
+{
+	switch (whichfork) {
+	case XFS_DATA_FORK:
+		return xfs_inode_data_fork_size(ip);
+	case XFS_ATTR_FORK:
+		return xfs_inode_attr_fork_size(ip);
+	default:
+		return 0;
+	}
+}
+
 /* Convert from vfs inode to xfs inode */
 static inline struct xfs_inode *XFS_I(struct inode *inode)
 {
@@ -505,9 +569,6 @@ extern struct kmem_cache	*xfs_inode_cache;
 
 bool xfs_inode_needs_inactive(struct xfs_inode *ip);
 
-int xfs_iunlink_init(struct xfs_perag *pag);
-void xfs_iunlink_destroy(struct xfs_perag *pag);
-
 void xfs_end_io(struct work_struct *work);
 
 int xfs_ilock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2);
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 721def0639fd..6e19ece916bf 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -57,7 +57,7 @@ xfs_inode_item_data_fork_size(
 		    ip->i_df.if_nextents > 0 &&
 		    ip->i_df.if_bytes > 0) {
 			/* worst case, doesn't subtract delalloc extents */
-			*nbytes += XFS_IFORK_DSIZE(ip);
+			*nbytes += xfs_inode_data_fork_size(ip);
 			*nvecs += 1;
 		}
 		break;
@@ -92,27 +92,27 @@ xfs_inode_item_attr_fork_size(
 {
 	struct xfs_inode	*ip = iip->ili_inode;
 
-	switch (ip->i_afp->if_format) {
+	switch (ip->i_af.if_format) {
 	case XFS_DINODE_FMT_EXTENTS:
 		if ((iip->ili_fields & XFS_ILOG_AEXT) &&
-		    ip->i_afp->if_nextents > 0 &&
-		    ip->i_afp->if_bytes > 0) {
+		    ip->i_af.if_nextents > 0 &&
+		    ip->i_af.if_bytes > 0) {
 			/* worst case, doesn't subtract unused space */
-			*nbytes += XFS_IFORK_ASIZE(ip);
+			*nbytes += xfs_inode_attr_fork_size(ip);
 			*nvecs += 1;
 		}
 		break;
 	case XFS_DINODE_FMT_BTREE:
 		if ((iip->ili_fields & XFS_ILOG_ABROOT) &&
-		    ip->i_afp->if_broot_bytes > 0) {
-			*nbytes += ip->i_afp->if_broot_bytes;
+		    ip->i_af.if_broot_bytes > 0) {
+			*nbytes += ip->i_af.if_broot_bytes;
 			*nvecs += 1;
 		}
 		break;
 	case XFS_DINODE_FMT_LOCAL:
 		if ((iip->ili_fields & XFS_ILOG_ADATA) &&
-		    ip->i_afp->if_bytes > 0) {
-			*nbytes += xlog_calc_iovec_len(ip->i_afp->if_bytes);
+		    ip->i_af.if_bytes > 0) {
+			*nbytes += xlog_calc_iovec_len(ip->i_af.if_bytes);
 			*nvecs += 1;
 		}
 		break;
@@ -143,7 +143,7 @@ xfs_inode_item_size(
 		   xfs_log_dinode_size(ip->i_mount);
 
 	xfs_inode_item_data_fork_size(iip, nvecs, nbytes);
-	if (XFS_IFORK_Q(ip))
+	if (xfs_inode_has_attr_fork(ip))
 		xfs_inode_item_attr_fork_size(iip, nvecs, nbytes);
 }
 
@@ -237,18 +237,18 @@ xfs_inode_item_format_attr_fork(
 	struct xfs_inode	*ip = iip->ili_inode;
 	size_t			data_bytes;
 
-	switch (ip->i_afp->if_format) {
+	switch (ip->i_af.if_format) {
 	case XFS_DINODE_FMT_EXTENTS:
 		iip->ili_fields &=
 			~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT);
 
 		if ((iip->ili_fields & XFS_ILOG_AEXT) &&
-		    ip->i_afp->if_nextents > 0 &&
-		    ip->i_afp->if_bytes > 0) {
+		    ip->i_af.if_nextents > 0 &&
+		    ip->i_af.if_bytes > 0) {
 			struct xfs_bmbt_rec *p;
 
-			ASSERT(xfs_iext_count(ip->i_afp) ==
-				ip->i_afp->if_nextents);
+			ASSERT(xfs_iext_count(&ip->i_af) ==
+				ip->i_af.if_nextents);
 
 			p = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_EXT);
 			data_bytes = xfs_iextents_copy(ip, p, XFS_ATTR_FORK);
@@ -265,13 +265,13 @@ xfs_inode_item_format_attr_fork(
 			~(XFS_ILOG_ADATA | XFS_ILOG_AEXT);
 
 		if ((iip->ili_fields & XFS_ILOG_ABROOT) &&
-		    ip->i_afp->if_broot_bytes > 0) {
-			ASSERT(ip->i_afp->if_broot != NULL);
+		    ip->i_af.if_broot_bytes > 0) {
+			ASSERT(ip->i_af.if_broot != NULL);
 
 			xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_BROOT,
-					ip->i_afp->if_broot,
-					ip->i_afp->if_broot_bytes);
-			ilf->ilf_asize = ip->i_afp->if_broot_bytes;
+					ip->i_af.if_broot,
+					ip->i_af.if_broot_bytes);
+			ilf->ilf_asize = ip->i_af.if_broot_bytes;
 			ilf->ilf_size++;
 		} else {
 			iip->ili_fields &= ~XFS_ILOG_ABROOT;
@@ -282,12 +282,12 @@ xfs_inode_item_format_attr_fork(
 			~(XFS_ILOG_AEXT | XFS_ILOG_ABROOT);
 
 		if ((iip->ili_fields & XFS_ILOG_ADATA) &&
-		    ip->i_afp->if_bytes > 0) {
-			ASSERT(ip->i_afp->if_u1.if_data != NULL);
+		    ip->i_af.if_bytes > 0) {
+			ASSERT(ip->i_af.if_u1.if_data != NULL);
 			xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_LOCAL,
-					ip->i_afp->if_u1.if_data,
-					ip->i_afp->if_bytes);
-			ilf->ilf_asize = (unsigned)ip->i_afp->if_bytes;
+					ip->i_af.if_u1.if_data,
+					ip->i_af.if_bytes);
+			ilf->ilf_asize = (unsigned)ip->i_af.if_bytes;
 			ilf->ilf_size++;
 		} else {
 			iip->ili_fields &= ~XFS_ILOG_ADATA;
@@ -355,11 +355,11 @@ xfs_inode_to_log_dinode_iext_counters(
 {
 	if (xfs_inode_has_large_extent_counts(ip)) {
 		to->di_big_nextents = xfs_ifork_nextents(&ip->i_df);
-		to->di_big_anextents = xfs_ifork_nextents(ip->i_afp);
+		to->di_big_anextents = xfs_ifork_nextents(&ip->i_af);
 		to->di_nrext64_pad = 0;
 	} else {
 		to->di_nextents = xfs_ifork_nextents(&ip->i_df);
-		to->di_anextents = xfs_ifork_nextents(ip->i_afp);
+		to->di_anextents = xfs_ifork_nextents(&ip->i_af);
 	}
 }
 
@@ -390,7 +390,7 @@ xfs_inode_to_log_dinode(
 	to->di_nblocks = ip->i_nblocks;
 	to->di_extsize = ip->i_extsize;
 	to->di_forkoff = ip->i_forkoff;
-	to->di_aformat = xfs_ifork_format(ip->i_afp);
+	to->di_aformat = xfs_ifork_format(&ip->i_af);
 	to->di_flags = ip->i_diflags;
 
 	xfs_copy_dm_fields_to_log_dinode(ip, to);
@@ -480,7 +480,7 @@ xfs_inode_item_format(
 
 	xfs_inode_item_format_core(ip, lv, &vecp);
 	xfs_inode_item_format_data_fork(iip, ilf, lv, &vecp);
-	if (XFS_IFORK_Q(ip)) {
+	if (xfs_inode_has_attr_fork(ip)) {
 		xfs_inode_item_format_attr_fork(iip, ilf, lv, &vecp);
 	} else {
 		iip->ili_fields &=
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 0d67ff8a8961..1f783e979629 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -955,6 +955,7 @@ xfs_ioc_ag_geometry(
 	struct xfs_mount	*mp,
 	void			__user *arg)
 {
+	struct xfs_perag	*pag;
 	struct xfs_ag_geometry	ageo;
 	int			error;
 
@@ -965,7 +966,12 @@ xfs_ioc_ag_geometry(
 	if (memchr_inv(&ageo.ag_reserved, 0, sizeof(ageo.ag_reserved)))
 		return -EINVAL;
 
-	error = xfs_ag_get_geometry(mp, ageo.ag_number, &ageo);
+	pag = xfs_perag_get(mp, ageo.ag_number);
+	if (!pag)
+		return -EINVAL;
+
+	error = xfs_ag_get_geometry(pag, &ageo);
+	xfs_perag_put(pag);
 	if (error)
 		return error;
 
@@ -985,7 +991,7 @@ xfs_fill_fsxattr(
 	struct fileattr		*fa)
 {
 	struct xfs_mount	*mp = ip->i_mount;
-	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
 
 	fileattr_fill_xflags(fa, xfs_ip2xflags(ip));
 
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 5d50fed291b4..2817d3dd25eb 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -159,7 +159,7 @@ xfs_iomap_eof_align_last_fsb(
 	struct xfs_inode	*ip,
 	xfs_fileoff_t		end_fsb)
 {
-	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
 	xfs_extlen_t		extsz = xfs_get_extsz_hint(ip);
 	xfs_extlen_t		align = xfs_eof_alignment(ip);
 	struct xfs_bmbt_irec	irec;
@@ -370,7 +370,7 @@ xfs_iomap_prealloc_size(
 	struct xfs_iext_cursor	ncur = *icur;
 	struct xfs_bmbt_irec	prev, got;
 	struct xfs_mount	*mp = ip->i_mount;
-	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
 	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
 	int64_t			freesp;
 	xfs_fsblock_t		qblocks;
@@ -1310,12 +1310,12 @@ xfs_xattr_iomap_begin(
 	lockmode = xfs_ilock_attr_map_shared(ip);
 
 	/* if there are no attribute fork or extents, return ENOENT */
-	if (!XFS_IFORK_Q(ip) || !ip->i_afp->if_nextents) {
+	if (!xfs_inode_has_attr_fork(ip) || !ip->i_af.if_nextents) {
 		error = -ENOENT;
 		goto out_unlock;
 	}
 
-	ASSERT(ip->i_afp->if_format != XFS_DINODE_FMT_LOCAL);
+	ASSERT(ip->i_af.if_format != XFS_DINODE_FMT_LOCAL);
 	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
 			       &nimaps, XFS_BMAPI_ATTRFORK);
 out_unlock:
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index a7402f6ea510..45518b8c613c 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -75,9 +75,8 @@ xfs_initxattrs(
  * these attrs can be journalled at inode creation time (along with the
  * inode, of course, such that log replay can't cause these to be lost).
  */
-
-STATIC int
-xfs_init_security(
+int
+xfs_inode_init_security(
 	struct inode	*inode,
 	struct inode	*dir,
 	const struct qstr *qstr)
@@ -122,7 +121,7 @@ xfs_cleanup_inode(
 
 	/* Oh, the horror.
 	 * If we can't add the ACL or we fail in
-	 * xfs_init_security we must back out.
+	 * xfs_inode_init_security we must back out.
 	 * ENOSPC can hit here, among other things.
 	 */
 	xfs_dentry_to_name(&teardown, dentry);
@@ -208,7 +207,7 @@ xfs_generic_create(
 
 	inode = VFS_I(ip);
 
-	error = xfs_init_security(inode, dir, &dentry->d_name);
+	error = xfs_inode_init_security(inode, dir, &dentry->d_name);
 	if (unlikely(error))
 		goto out_cleanup_inode;
 
@@ -424,7 +423,7 @@ xfs_vn_symlink(
 
 	inode = VFS_I(cip);
 
-	error = xfs_init_security(inode, dir, &dentry->d_name);
+	error = xfs_inode_init_security(inode, dir, &dentry->d_name);
 	if (unlikely(error))
 		goto out_cleanup_inode;
 
@@ -1282,7 +1281,7 @@ xfs_setup_inode(
 	 * If there is no attribute fork no ACL can exist on this inode,
 	 * and it can't have any file capabilities attached to it either.
 	 */
-	if (!XFS_IFORK_Q(ip)) {
+	if (!xfs_inode_has_attr_fork(ip)) {
 		inode_has_no_xattr(inode);
 		cache_no_acl(inode);
 	}
diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h
index 278949056048..cb5fc68c9ea0 100644
--- a/fs/xfs/xfs_iops.h
+++ b/fs/xfs/xfs_iops.h
@@ -17,4 +17,7 @@ extern void xfs_setattr_time(struct xfs_inode *ip, struct iattr *iattr);
 int xfs_vn_setattr_size(struct user_namespace *mnt_userns,
 		struct dentry *dentry, struct iattr *vap);
 
+int xfs_inode_init_security(struct inode *inode, struct inode *dir,
+		const struct qstr *qstr);
+
 #endif /* __XFS_IOPS_H__ */
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index f74c9fff72bb..36312b00b164 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -111,8 +111,8 @@ xfs_bulkstat_one_int(
 		buf->bs_extents64 = nextents;
 
 	xfs_bulkstat_health(ip, buf);
-	buf->bs_aextents = xfs_ifork_nextents(ip->i_afp);
-	buf->bs_forkoff = XFS_IFORK_BOFF(ip);
+	buf->bs_aextents = xfs_ifork_nextents(&ip->i_af);
+	buf->bs_forkoff = xfs_inode_fork_boff(ip);
 	buf->bs_version = XFS_BULKSTAT_VERSION_V5;
 
 	if (xfs_has_v3inodes(mp)) {
diff --git a/fs/xfs/xfs_iunlink_item.c b/fs/xfs/xfs_iunlink_item.c
new file mode 100644
index 000000000000..43005ce8bd48
--- /dev/null
+++ b/fs/xfs/xfs_iunlink_item.c
@@ -0,0 +1,180 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2020-2022, Red Hat, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_ag.h"
+#include "xfs_iunlink_item.h"
+#include "xfs_trace.h"
+#include "xfs_error.h"
+
+struct kmem_cache	*xfs_iunlink_cache;
+
+static inline struct xfs_iunlink_item *IUL_ITEM(struct xfs_log_item *lip)
+{
+	return container_of(lip, struct xfs_iunlink_item, item);
+}
+
+static void
+xfs_iunlink_item_release(
+	struct xfs_log_item	*lip)
+{
+	struct xfs_iunlink_item	*iup = IUL_ITEM(lip);
+
+	xfs_perag_put(iup->pag);
+	kmem_cache_free(xfs_iunlink_cache, IUL_ITEM(lip));
+}
+
+
+static uint64_t
+xfs_iunlink_item_sort(
+	struct xfs_log_item	*lip)
+{
+	return IUL_ITEM(lip)->ip->i_ino;
+}
+
+/*
+ * Look up the inode cluster buffer and log the on-disk unlinked inode change
+ * we need to make.
+ */
+static int
+xfs_iunlink_log_dinode(
+	struct xfs_trans	*tp,
+	struct xfs_iunlink_item	*iup)
+{
+	struct xfs_mount	*mp = tp->t_mountp;
+	struct xfs_inode	*ip = iup->ip;
+	struct xfs_dinode	*dip;
+	struct xfs_buf		*ibp;
+	int			offset;
+	int			error;
+
+	error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &ibp);
+	if (error)
+		return error;
+	/*
+	 * Don't log the unlinked field on stale buffers as this may be the
+	 * transaction that frees the inode cluster and relogging the buffer
+	 * here will incorrectly remove the stale state.
+	 */
+	if (ibp->b_flags & XBF_STALE)
+		goto out;
+
+	dip = xfs_buf_offset(ibp, ip->i_imap.im_boffset);
+
+	/* Make sure the old pointer isn't garbage. */
+	if (be32_to_cpu(dip->di_next_unlinked) != iup->old_agino) {
+		xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip,
+				sizeof(*dip), __this_address);
+		error = -EFSCORRUPTED;
+		goto out;
+	}
+
+	trace_xfs_iunlink_update_dinode(mp, iup->pag->pag_agno,
+			XFS_INO_TO_AGINO(mp, ip->i_ino),
+			be32_to_cpu(dip->di_next_unlinked), iup->next_agino);
+
+	dip->di_next_unlinked = cpu_to_be32(iup->next_agino);
+	offset = ip->i_imap.im_boffset +
+			offsetof(struct xfs_dinode, di_next_unlinked);
+
+	xfs_dinode_calc_crc(mp, dip);
+	xfs_trans_inode_buf(tp, ibp);
+	xfs_trans_log_buf(tp, ibp, offset, offset + sizeof(xfs_agino_t) - 1);
+	return 0;
+out:
+	xfs_trans_brelse(tp, ibp);
+	return error;
+}
+
+/*
+ * On precommit, we grab the inode cluster buffer for the inode number we were
+ * passed, then update the next unlinked field for that inode in the buffer and
+ * log the buffer. This ensures that the inode cluster buffer was logged in the
+ * correct order w.r.t. other inode cluster buffers. We can then remove the
+ * iunlink item from the transaction and release it as it is has now served it's
+ * purpose.
+ */
+static int
+xfs_iunlink_item_precommit(
+	struct xfs_trans	*tp,
+	struct xfs_log_item	*lip)
+{
+	struct xfs_iunlink_item	*iup = IUL_ITEM(lip);
+	int			error;
+
+	error = xfs_iunlink_log_dinode(tp, iup);
+	list_del(&lip->li_trans);
+	xfs_iunlink_item_release(lip);
+	return error;
+}
+
+static const struct xfs_item_ops xfs_iunlink_item_ops = {
+	.iop_release	= xfs_iunlink_item_release,
+	.iop_sort	= xfs_iunlink_item_sort,
+	.iop_precommit	= xfs_iunlink_item_precommit,
+};
+
+
+/*
+ * Initialize the inode log item for a newly allocated (in-core) inode.
+ *
+ * Inode extents can only reside within an AG. Hence specify the starting
+ * block for the inode chunk by offset within an AG as well as the
+ * length of the allocated extent.
+ *
+ * This joins the item to the transaction and marks it dirty so
+ * that we don't need a separate call to do this, nor does the
+ * caller need to know anything about the iunlink item.
+ */
+int
+xfs_iunlink_log_inode(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip,
+	struct xfs_perag	*pag,
+	xfs_agino_t		next_agino)
+{
+	struct xfs_mount	*mp = tp->t_mountp;
+	struct xfs_iunlink_item	*iup;
+
+	ASSERT(xfs_verify_agino_or_null(pag, next_agino));
+	ASSERT(xfs_verify_agino_or_null(pag, ip->i_next_unlinked));
+
+	/*
+	 * Since we're updating a linked list, we should never find that the
+	 * current pointer is the same as the new value, unless we're
+	 * terminating the list.
+	 */
+	if (ip->i_next_unlinked == next_agino) {
+		if (next_agino != NULLAGINO)
+			return -EFSCORRUPTED;
+		return 0;
+	}
+
+	iup = kmem_cache_zalloc(xfs_iunlink_cache, GFP_KERNEL | __GFP_NOFAIL);
+	xfs_log_item_init(mp, &iup->item, XFS_LI_IUNLINK,
+			  &xfs_iunlink_item_ops);
+
+	iup->ip = ip;
+	iup->next_agino = next_agino;
+	iup->old_agino = ip->i_next_unlinked;
+
+	atomic_inc(&pag->pag_ref);
+	iup->pag = pag;
+
+	xfs_trans_add_item(tp, &iup->item);
+	tp->t_flags |= XFS_TRANS_DIRTY;
+	set_bit(XFS_LI_DIRTY, &iup->item.li_flags);
+	return 0;
+}
+
diff --git a/fs/xfs/xfs_iunlink_item.h b/fs/xfs/xfs_iunlink_item.h
new file mode 100644
index 000000000000..c793cdcaccde
--- /dev/null
+++ b/fs/xfs/xfs_iunlink_item.h
@@ -0,0 +1,27 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2020-2022, Red Hat, Inc.
+ * All Rights Reserved.
+ */
+#ifndef XFS_IUNLINK_ITEM_H
+#define XFS_IUNLINK_ITEM_H	1
+
+struct xfs_trans;
+struct xfs_inode;
+struct xfs_perag;
+
+/* in memory log item structure */
+struct xfs_iunlink_item {
+	struct xfs_log_item	item;
+	struct xfs_inode	*ip;
+	struct xfs_perag	*pag;
+	xfs_agino_t		next_agino;
+	xfs_agino_t		old_agino;
+};
+
+extern struct kmem_cache *xfs_iunlink_cache;
+
+int xfs_iunlink_log_inode(struct xfs_trans *tp, struct xfs_inode *ip,
+			struct xfs_perag *pag, xfs_agino_t next_agino);
+
+#endif	/* XFS_IUNLINK_ITEM_H */
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index ae904b21e9cc..4b1c0a9c6368 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -57,7 +57,8 @@ xlog_grant_push_ail(
 STATIC void
 xlog_sync(
 	struct xlog		*log,
-	struct xlog_in_core	*iclog);
+	struct xlog_in_core	*iclog,
+	struct xlog_ticket	*ticket);
 #if defined(DEBUG)
 STATIC void
 xlog_verify_grant_tail(
@@ -567,7 +568,8 @@ xlog_state_shutdown_callbacks(
 int
 xlog_state_release_iclog(
 	struct xlog		*log,
-	struct xlog_in_core	*iclog)
+	struct xlog_in_core	*iclog,
+	struct xlog_ticket	*ticket)
 {
 	xfs_lsn_t		tail_lsn;
 	bool			last_ref;
@@ -614,7 +616,7 @@ xlog_state_release_iclog(
 	trace_xlog_iclog_syncing(iclog, _RET_IP_);
 
 	spin_unlock(&log->l_icloglock);
-	xlog_sync(log, iclog);
+	xlog_sync(log, iclog, ticket);
 	spin_lock(&log->l_icloglock);
 	return 0;
 }
@@ -881,7 +883,7 @@ xlog_force_iclog(
 	iclog->ic_flags |= XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA;
 	if (iclog->ic_state == XLOG_STATE_ACTIVE)
 		xlog_state_switch_iclogs(iclog->ic_log, iclog, 0);
-	return xlog_state_release_iclog(iclog->ic_log, iclog);
+	return xlog_state_release_iclog(iclog->ic_log, iclog, NULL);
 }
 
 /*
@@ -944,6 +946,8 @@ xlog_write_unmount_record(
 		.lv_niovecs = 1,
 		.lv_iovecp = &reg,
 	};
+	LIST_HEAD(lv_chain);
+	list_add(&vec.lv_list, &lv_chain);
 
 	BUILD_BUG_ON((sizeof(struct xlog_op_header) +
 		      sizeof(struct xfs_unmount_log_format)) !=
@@ -952,7 +956,7 @@ xlog_write_unmount_record(
 	/* account for space used by record data */
 	ticket->t_curr_res -= sizeof(unmount_rec);
 
-	return xlog_write(log, NULL, &vec, ticket, reg.i_len);
+	return xlog_write(log, NULL, &lv_chain, ticket, reg.i_len);
 }
 
 /*
@@ -2000,7 +2004,7 @@ xlog_calc_iclog_size(
 }
 
 /*
- * Flush out the in-core log (iclog) to the on-disk log in an asynchronous 
+ * Flush out the in-core log (iclog) to the on-disk log in an asynchronous
  * fashion.  Previously, we should have moved the current iclog
  * ptr in the log to point to the next available iclog.  This allows further
  * write to continue while this code syncs out an iclog ready to go.
@@ -2025,7 +2029,8 @@ xlog_calc_iclog_size(
 STATIC void
 xlog_sync(
 	struct xlog		*log,
-	struct xlog_in_core	*iclog)
+	struct xlog_in_core	*iclog,
+	struct xlog_ticket	*ticket)
 {
 	unsigned int		count;		/* byte count of bwrite */
 	unsigned int		roundoff;       /* roundoff to BB or stripe */
@@ -2037,12 +2042,20 @@ xlog_sync(
 
 	count = xlog_calc_iclog_size(log, iclog, &roundoff);
 
-	/* move grant heads by roundoff in sync */
-	xlog_grant_add_space(log, &log->l_reserve_head.grant, roundoff);
-	xlog_grant_add_space(log, &log->l_write_head.grant, roundoff);
+	/*
+	 * If we have a ticket, account for the roundoff via the ticket
+	 * reservation to avoid touching the hot grant heads needlessly.
+	 * Otherwise, we have to move grant heads directly.
+	 */
+	if (ticket) {
+		ticket->t_curr_res -= roundoff;
+	} else {
+		xlog_grant_add_space(log, &log->l_reserve_head.grant, roundoff);
+		xlog_grant_add_space(log, &log->l_write_head.grant, roundoff);
+	}
 
 	/* put cycle number in every block */
-	xlog_pack_data(log, iclog, roundoff); 
+	xlog_pack_data(log, iclog, roundoff);
 
 	/* real byte length */
 	size = iclog->ic_offset;
@@ -2275,7 +2288,7 @@ xlog_write_get_more_iclog_space(
 	spin_lock(&log->l_icloglock);
 	ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC);
 	xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt);
-	error = xlog_state_release_iclog(log, iclog);
+	error = xlog_state_release_iclog(log, iclog, ticket);
 	spin_unlock(&log->l_icloglock);
 	if (error)
 		return error;
@@ -2471,13 +2484,13 @@ int
 xlog_write(
 	struct xlog		*log,
 	struct xfs_cil_ctx	*ctx,
-	struct xfs_log_vec	*log_vector,
+	struct list_head	*lv_chain,
 	struct xlog_ticket	*ticket,
 	uint32_t		len)
 
 {
 	struct xlog_in_core	*iclog = NULL;
-	struct xfs_log_vec	*lv = log_vector;
+	struct xfs_log_vec	*lv;
 	uint32_t		record_cnt = 0;
 	uint32_t		data_cnt = 0;
 	int			error = 0;
@@ -2505,7 +2518,7 @@ xlog_write(
 	if (ctx)
 		xlog_cil_set_ctx_write_state(ctx, iclog);
 
-	while (lv) {
+	list_for_each_entry(lv, lv_chain, lv_list) {
 		/*
 		 * If the entire log vec does not fit in the iclog, punt it to
 		 * the partial copy loop which can handle this case.
@@ -2526,7 +2539,6 @@ xlog_write(
 			xlog_write_full(lv, ticket, iclog, &log_offset,
 					 &len, &record_cnt, &data_cnt);
 		}
-		lv = lv->lv_next;
 	}
 	ASSERT(len == 0);
 
@@ -2538,7 +2550,7 @@ xlog_write(
 	 */
 	spin_lock(&log->l_icloglock);
 	xlog_state_finish_copy(log, iclog, record_cnt, 0);
-	error = xlog_state_release_iclog(log, iclog);
+	error = xlog_state_release_iclog(log, iclog, ticket);
 	spin_unlock(&log->l_icloglock);
 
 	return error;
@@ -2958,7 +2970,7 @@ restart:
 		 * reference to the iclog.
 		 */
 		if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1))
-			error = xlog_state_release_iclog(log, iclog);
+			error = xlog_state_release_iclog(log, iclog, ticket);
 		spin_unlock(&log->l_icloglock);
 		if (error)
 			return error;
@@ -3406,7 +3418,8 @@ xfs_log_ticket_get(
 static int
 xlog_calc_unit_res(
 	struct xlog		*log,
-	int			unit_bytes)
+	int			unit_bytes,
+	int			*niclogs)
 {
 	int			iclog_space;
 	uint			num_headers;
@@ -3486,6 +3499,8 @@ xlog_calc_unit_res(
 	/* roundoff padding for transaction data and one for commit record */
 	unit_bytes += 2 * log->l_iclog_roundoff;
 
+	if (niclogs)
+		*niclogs = num_headers;
 	return unit_bytes;
 }
 
@@ -3494,7 +3509,7 @@ xfs_log_calc_unit_res(
 	struct xfs_mount	*mp,
 	int			unit_bytes)
 {
-	return xlog_calc_unit_res(mp->m_log, unit_bytes);
+	return xlog_calc_unit_res(mp->m_log, unit_bytes, NULL);
 }
 
 /*
@@ -3512,7 +3527,7 @@ xlog_ticket_alloc(
 
 	tic = kmem_cache_zalloc(xfs_log_ticket_cache, GFP_NOFS | __GFP_NOFAIL);
 
-	unit_res = xlog_calc_unit_res(log, unit_bytes);
+	unit_res = xlog_calc_unit_res(log, unit_bytes, &tic->t_iclog_hdrs);
 
 	atomic_set(&tic->t_ref, 1);
 	tic->t_task		= current;
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index f3ce046a7d45..2728886c2963 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -9,7 +9,8 @@
 struct xfs_cil_ctx;
 
 struct xfs_log_vec {
-	struct xfs_log_vec	*lv_next;	/* next lv in build list */
+	struct list_head	lv_list;	/* CIL lv chain ptrs */
+	uint32_t		lv_order_id;	/* chain ordering info */
 	int			lv_niovecs;	/* number of iovecs in lv */
 	struct xfs_log_iovec	*lv_iovecp;	/* iovec array */
 	struct xfs_log_item	*lv_item;	/* owner */
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index db6cb7800251..eccbfb99e894 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -44,9 +44,20 @@ xlog_cil_ticket_alloc(
 	 * transaction overhead reservation from the first transaction commit.
 	 */
 	tic->t_curr_res = 0;
+	tic->t_iclog_hdrs = 0;
 	return tic;
 }
 
+static inline void
+xlog_cil_set_iclog_hdr_count(struct xfs_cil *cil)
+{
+	struct xlog	*log = cil->xc_log;
+
+	atomic_set(&cil->xc_iclog_hdrs,
+		   (XLOG_CIL_BLOCKING_SPACE_LIMIT(log) /
+			(log->l_iclog_size - log->l_iclog_hsize)));
+}
+
 /*
  * Check if the current log item was first committed in this sequence.
  * We can't rely on just the log item being in the CIL, we have to check
@@ -61,7 +72,7 @@ xlog_item_in_current_chkpt(
 	struct xfs_cil		*cil,
 	struct xfs_log_item	*lip)
 {
-	if (list_empty(&lip->li_cil))
+	if (test_bit(XLOG_CIL_EMPTY, &cil->xc_flags))
 		return false;
 
 	/*
@@ -93,15 +104,88 @@ xlog_cil_ctx_alloc(void)
 	ctx = kmem_zalloc(sizeof(*ctx), KM_NOFS);
 	INIT_LIST_HEAD(&ctx->committing);
 	INIT_LIST_HEAD(&ctx->busy_extents);
+	INIT_LIST_HEAD(&ctx->log_items);
+	INIT_LIST_HEAD(&ctx->lv_chain);
 	INIT_WORK(&ctx->push_work, xlog_cil_push_work);
 	return ctx;
 }
 
+/*
+ * Aggregate the CIL per cpu structures into global counts, lists, etc and
+ * clear the percpu state ready for the next context to use. This is called
+ * from the push code with the context lock held exclusively, hence nothing else
+ * will be accessing or modifying the per-cpu counters.
+ */
+static void
+xlog_cil_push_pcp_aggregate(
+	struct xfs_cil		*cil,
+	struct xfs_cil_ctx	*ctx)
+{
+	struct xlog_cil_pcp	*cilpcp;
+	int			cpu;
+
+	for_each_online_cpu(cpu) {
+		cilpcp = per_cpu_ptr(cil->xc_pcp, cpu);
+
+		ctx->ticket->t_curr_res += cilpcp->space_reserved;
+		cilpcp->space_reserved = 0;
+
+		if (!list_empty(&cilpcp->busy_extents)) {
+			list_splice_init(&cilpcp->busy_extents,
+					&ctx->busy_extents);
+		}
+		if (!list_empty(&cilpcp->log_items))
+			list_splice_init(&cilpcp->log_items, &ctx->log_items);
+
+		/*
+		 * We're in the middle of switching cil contexts.  Reset the
+		 * counter we use to detect when the current context is nearing
+		 * full.
+		 */
+		cilpcp->space_used = 0;
+	}
+}
+
+/*
+ * Aggregate the CIL per-cpu space used counters into the global atomic value.
+ * This is called when the per-cpu counter aggregation will first pass the soft
+ * limit threshold so we can switch to atomic counter aggregation for accurate
+ * detection of hard limit traversal.
+ */
+static void
+xlog_cil_insert_pcp_aggregate(
+	struct xfs_cil		*cil,
+	struct xfs_cil_ctx	*ctx)
+{
+	struct xlog_cil_pcp	*cilpcp;
+	int			cpu;
+	int			count = 0;
+
+	/* Trigger atomic updates then aggregate only for the first caller */
+	if (!test_and_clear_bit(XLOG_CIL_PCP_SPACE, &cil->xc_flags))
+		return;
+
+	for_each_online_cpu(cpu) {
+		int	old, prev;
+
+		cilpcp = per_cpu_ptr(cil->xc_pcp, cpu);
+		do {
+			old = cilpcp->space_used;
+			prev = cmpxchg(&cilpcp->space_used, old, 0);
+		} while (old != prev);
+		count += old;
+	}
+	atomic_add(count, &ctx->space_used);
+}
+
 static void
 xlog_cil_ctx_switch(
 	struct xfs_cil		*cil,
 	struct xfs_cil_ctx	*ctx)
 {
+	xlog_cil_set_iclog_hdr_count(cil);
+	set_bit(XLOG_CIL_EMPTY, &cil->xc_flags);
+	set_bit(XLOG_CIL_PCP_SPACE, &cil->xc_flags);
 	ctx->sequence = ++cil->xc_current_sequence;
 	ctx->cil = cil;
 	cil->xc_ctx = ctx;
@@ -123,6 +207,7 @@ xlog_cil_init_post_recovery(
 {
 	log->l_cilp->xc_ctx->ticket = xlog_cil_ticket_alloc(log);
 	log->l_cilp->xc_ctx->sequence = 1;
+	xlog_cil_set_iclog_hdr_count(log->l_cilp);
 }
 
 static inline int
@@ -254,6 +339,7 @@ xlog_cil_alloc_shadow_bufs(
 
 			memset(lv, 0, xlog_cil_iovec_space(niovecs));
 
+			INIT_LIST_HEAD(&lv->lv_list);
 			lv->lv_item = lip;
 			lv->lv_size = buf_size;
 			if (ordered)
@@ -269,7 +355,6 @@ xlog_cil_alloc_shadow_bufs(
 			else
 				lv->lv_buf_len = 0;
 			lv->lv_bytes = 0;
-			lv->lv_next = NULL;
 		}
 
 		/* Ensure the lv is set up according to ->iop_size */
@@ -396,7 +481,6 @@ xlog_cil_insert_format_items(
 		if (lip->li_lv && shadow->lv_size <= lip->li_lv->lv_size) {
 			/* same or smaller, optimise common overwrite case */
 			lv = lip->li_lv;
-			lv->lv_next = NULL;
 
 			if (ordered)
 				goto insert;
@@ -433,6 +517,23 @@ insert:
 	}
 }
 
+/*
+ * The use of lockless waitqueue_active() requires that the caller has
+ * serialised itself against the wakeup call in xlog_cil_push_work(). That
+ * can be done by either holding the push lock or the context lock.
+ */
+static inline bool
+xlog_cil_over_hard_limit(
+	struct xlog	*log,
+	int32_t		space_used)
+{
+	if (waitqueue_active(&log->l_cilp->xc_push_wait))
+		return true;
+	if (space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log))
+		return true;
+	return false;
+}
+
 /*
  * Insert the log items into the CIL and calculate the difference in space
  * consumed by the item. Add the space to the checkpoint ticket and calculate
@@ -450,8 +551,10 @@ xlog_cil_insert_items(
 	struct xfs_cil_ctx	*ctx = cil->xc_ctx;
 	struct xfs_log_item	*lip;
 	int			len = 0;
-	int			iclog_space;
 	int			iovhdr_res = 0, split_res = 0, ctx_res = 0;
+	int			space_used;
+	int			order;
+	struct xlog_cil_pcp	*cilpcp;
 
 	ASSERT(tp);
 
@@ -461,46 +564,112 @@ xlog_cil_insert_items(
 	 */
 	xlog_cil_insert_format_items(log, tp, &len);
 
-	spin_lock(&cil->xc_cil_lock);
-
-	/* attach the transaction to the CIL if it has any busy extents */
-	if (!list_empty(&tp->t_busy))
-		list_splice_init(&tp->t_busy, &ctx->busy_extents);
+	/*
+	 * Subtract the space released by intent cancelation from the space we
+	 * consumed so that we remove it from the CIL space and add it back to
+	 * the current transaction reservation context.
+	 */
+	len -= released_space;
 
 	/*
-	 * Now transfer enough transaction reservation to the context ticket
-	 * for the checkpoint. The context ticket is special - the unit
-	 * reservation has to grow as well as the current reservation as we
-	 * steal from tickets so we can correctly determine the space used
-	 * during the transaction commit.
+	 * Grab the per-cpu pointer for the CIL before we start any accounting.
+	 * That ensures that we are running with pre-emption disabled and so we
+	 * can't be scheduled away between split sample/update operations that
+	 * are done without outside locking to serialise them.
 	 */
-	if (ctx->ticket->t_curr_res == 0) {
-		ctx_res = ctx->ticket->t_unit_res;
-		ctx->ticket->t_curr_res = ctx_res;
-		tp->t_ticket->t_curr_res -= ctx_res;
-	}
+	cilpcp = get_cpu_ptr(cil->xc_pcp);
 
-	/* do we need space for more log record headers? */
-	iclog_space = log->l_iclog_size - log->l_iclog_hsize;
-	if (len > 0 && (ctx->space_used / iclog_space !=
-				(ctx->space_used + len) / iclog_space)) {
-		split_res = (len + iclog_space - 1) / iclog_space;
-		/* need to take into account split region headers, too */
-		split_res *= log->l_iclog_hsize + sizeof(struct xlog_op_header);
-		ctx->ticket->t_unit_res += split_res;
-		ctx->ticket->t_curr_res += split_res;
-		tp->t_ticket->t_curr_res -= split_res;
-		ASSERT(tp->t_ticket->t_curr_res >= len);
+	/*
+	 * We need to take the CIL checkpoint unit reservation on the first
+	 * commit into the CIL. Test the XLOG_CIL_EMPTY bit first so we don't
+	 * unnecessarily do an atomic op in the fast path here. We can clear the
+	 * XLOG_CIL_EMPTY bit as we are under the xc_ctx_lock here and that
+	 * needs to be held exclusively to reset the XLOG_CIL_EMPTY bit.
+	 */
+	if (test_bit(XLOG_CIL_EMPTY, &cil->xc_flags) &&
+	    test_and_clear_bit(XLOG_CIL_EMPTY, &cil->xc_flags))
+		ctx_res = ctx->ticket->t_unit_res;
+
+	/*
+	 * Check if we need to steal iclog headers. atomic_read() is not a
+	 * locked atomic operation, so we can check the value before we do any
+	 * real atomic ops in the fast path. If we've already taken the CIL unit
+	 * reservation from this commit, we've already got one iclog header
+	 * space reserved so we have to account for that otherwise we risk
+	 * overrunning the reservation on this ticket.
+	 *
+	 * If the CIL is already at the hard limit, we might need more header
+	 * space that originally reserved. So steal more header space from every
+	 * commit that occurs once we are over the hard limit to ensure the CIL
+	 * push won't run out of reservation space.
+	 *
+	 * This can steal more than we need, but that's OK.
+	 *
+	 * The cil->xc_ctx_lock provides the serialisation necessary for safely
+	 * calling xlog_cil_over_hard_limit() in this context.
+	 */
+	space_used = atomic_read(&ctx->space_used) + cilpcp->space_used + len;
+	if (atomic_read(&cil->xc_iclog_hdrs) > 0 ||
+	    xlog_cil_over_hard_limit(log, space_used)) {
+		split_res = log->l_iclog_hsize +
+					sizeof(struct xlog_op_header);
+		if (ctx_res)
+			ctx_res += split_res * (tp->t_ticket->t_iclog_hdrs - 1);
+		else
+			ctx_res = split_res * tp->t_ticket->t_iclog_hdrs;
+		atomic_sub(tp->t_ticket->t_iclog_hdrs, &cil->xc_iclog_hdrs);
 	}
-	tp->t_ticket->t_curr_res -= len;
-	tp->t_ticket->t_curr_res += released_space;
-	ctx->space_used += len;
-	ctx->space_used -= released_space;
+	cilpcp->space_reserved += ctx_res;
+
+	/*
+	 * Accurately account when over the soft limit, otherwise fold the
+	 * percpu count into the global count if over the per-cpu threshold.
+	 */
+	if (!test_bit(XLOG_CIL_PCP_SPACE, &cil->xc_flags)) {
+		atomic_add(len, &ctx->space_used);
+	} else if (cilpcp->space_used + len >
+			(XLOG_CIL_SPACE_LIMIT(log) / num_online_cpus())) {
+		space_used = atomic_add_return(cilpcp->space_used + len,
+						&ctx->space_used);
+		cilpcp->space_used = 0;
+
+		/*
+		 * If we just transitioned over the soft limit, we need to
+		 * transition to the global atomic counter.
+		 */
+		if (space_used >= XLOG_CIL_SPACE_LIMIT(log))
+			xlog_cil_insert_pcp_aggregate(cil, ctx);
+	} else {
+		cilpcp->space_used += len;
+	}
+	/* attach the transaction to the CIL if it has any busy extents */
+	if (!list_empty(&tp->t_busy))
+		list_splice_init(&tp->t_busy, &cilpcp->busy_extents);
+
+	/*
+	 * Now update the order of everything modified in the transaction
+	 * and insert items into the CIL if they aren't already there.
+	 * We do this here so we only need to take the CIL lock once during
+	 * the transaction commit.
+	 */
+	order = atomic_inc_return(&ctx->order_id);
+	list_for_each_entry(lip, &tp->t_items, li_trans) {
+		/* Skip items which aren't dirty in this transaction. */
+		if (!test_bit(XFS_LI_DIRTY, &lip->li_flags))
+			continue;
+
+		lip->li_order_id = order;
+		if (!list_empty(&lip->li_cil))
+			continue;
+		list_add_tail(&lip->li_cil, &cilpcp->log_items);
+	}
+	put_cpu_ptr(cilpcp);
 
 	/*
 	 * If we've overrun the reservation, dump the tx details before we move
 	 * the log items. Shutdown is imminent...
 	 */
+	tp->t_ticket->t_curr_res -= ctx_res + len;
 	if (WARN_ON(tp->t_ticket->t_curr_res < 0)) {
 		xfs_warn(log->l_mp, "Transaction log reservation overrun:");
 		xfs_warn(log->l_mp,
@@ -510,44 +679,20 @@ xlog_cil_insert_items(
 			 split_res);
 		xfs_warn(log->l_mp, "  ctx ticket: %d bytes", ctx_res);
 		xlog_print_trans(tp);
-	}
-
-	/*
-	 * Now (re-)position everything modified at the tail of the CIL.
-	 * We do this here so we only need to take the CIL lock once during
-	 * the transaction commit.
-	 */
-	list_for_each_entry(lip, &tp->t_items, li_trans) {
-
-		/* Skip items which aren't dirty in this transaction. */
-		if (!test_bit(XFS_LI_DIRTY, &lip->li_flags))
-			continue;
-
-		/*
-		 * Only move the item if it isn't already at the tail. This is
-		 * to prevent a transient list_empty() state when reinserting
-		 * an item that is already the only item in the CIL.
-		 */
-		if (!list_is_last(&lip->li_cil, &cil->xc_cil))
-			list_move_tail(&lip->li_cil, &cil->xc_cil);
-	}
-
-	spin_unlock(&cil->xc_cil_lock);
-
-	if (tp->t_ticket->t_curr_res < 0)
 		xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
+	}
 }
 
 static void
 xlog_cil_free_logvec(
-	struct xfs_log_vec	*log_vector)
+	struct list_head	*lv_chain)
 {
 	struct xfs_log_vec	*lv;
 
-	for (lv = log_vector; lv; ) {
-		struct xfs_log_vec *next = lv->lv_next;
+	while (!list_empty(lv_chain)) {
+		lv = list_first_entry(lv_chain, struct xfs_log_vec, lv_list);
+		list_del_init(&lv->lv_list);
 		kmem_free(lv);
-		lv = next;
 	}
 }
 
@@ -647,7 +792,7 @@ xlog_cil_committed(
 		spin_unlock(&ctx->cil->xc_push_lock);
 	}
 
-	xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain,
+	xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, &ctx->lv_chain,
 					ctx->start_lsn, abort);
 
 	xfs_extent_busy_sort(&ctx->busy_extents);
@@ -658,7 +803,7 @@ xlog_cil_committed(
 	list_del(&ctx->committing);
 	spin_unlock(&ctx->cil->xc_push_lock);
 
-	xlog_cil_free_logvec(ctx->lv_chain);
+	xlog_cil_free_logvec(&ctx->lv_chain);
 
 	if (!list_empty(&ctx->busy_extents))
 		xlog_discard_busy_extents(mp, ctx);
@@ -817,7 +962,6 @@ restart:
 static int
 xlog_cil_write_chain(
 	struct xfs_cil_ctx	*ctx,
-	struct xfs_log_vec	*chain,
 	uint32_t		chain_len)
 {
 	struct xlog		*log = ctx->cil->xc_log;
@@ -826,7 +970,7 @@ xlog_cil_write_chain(
 	error = xlog_cil_order_write(ctx->cil, ctx->sequence, _START_RECORD);
 	if (error)
 		return error;
-	return xlog_write(log, ctx, chain, ctx->ticket, chain_len);
+	return xlog_write(log, ctx, &ctx->lv_chain, ctx->ticket, chain_len);
 }
 
 /*
@@ -855,6 +999,8 @@ xlog_cil_write_commit_record(
 		.lv_iovecp = &reg,
 	};
 	int			error;
+	LIST_HEAD(lv_chain);
+	list_add(&vec.lv_list, &lv_chain);
 
 	if (xlog_is_shutdown(log))
 		return -EIO;
@@ -865,7 +1011,7 @@ xlog_cil_write_commit_record(
 
 	/* account for space used by record data */
 	ctx->ticket->t_curr_res -= reg.i_len;
-	error = xlog_write(log, ctx, &vec, ctx->ticket, reg.i_len);
+	error = xlog_write(log, ctx, &lv_chain, ctx->ticket, reg.i_len);
 	if (error)
 		xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
 	return error;
@@ -931,11 +1077,30 @@ xlog_cil_build_trans_hdr(
 	lvhdr->lv_niovecs = 2;
 	lvhdr->lv_iovecp = &hdr->lhdr[0];
 	lvhdr->lv_bytes = hdr->lhdr[0].i_len + hdr->lhdr[1].i_len;
-	lvhdr->lv_next = ctx->lv_chain;
 
 	tic->t_curr_res -= lvhdr->lv_bytes;
 }
 
+/*
+ * CIL item reordering compare function. We want to order in ascending ID order,
+ * but we want to leave items with the same ID in the order they were added to
+ * the list. This is important for operations like reflink where we log 4 order
+ * dependent intents in a single transaction when we overwrite an existing
+ * shared extent with a new shared extent. i.e. BUI(unmap), CUI(drop),
+ * CUI (inc), BUI(remap)...
+ */
+static int
+xlog_cil_order_cmp(
+	void			*priv,
+	const struct list_head	*a,
+	const struct list_head	*b)
+{
+	struct xfs_log_vec	*l1 = container_of(a, struct xfs_log_vec, lv_list);
+	struct xfs_log_vec	*l2 = container_of(b, struct xfs_log_vec, lv_list);
+
+	return l1->lv_order_id > l2->lv_order_id;
+}
+
 /*
  * Pull all the log vectors off the items in the CIL, and remove the items from
  * the CIL. We don't need the CIL lock here because it's only needed on the
@@ -947,18 +1112,16 @@ xlog_cil_build_trans_hdr(
  */
 static void
 xlog_cil_build_lv_chain(
-	struct xfs_cil		*cil,
 	struct xfs_cil_ctx	*ctx,
 	struct list_head	*whiteouts,
 	uint32_t		*num_iovecs,
 	uint32_t		*num_bytes)
 {
-	struct xfs_log_vec	*lv = NULL;
-
-	while (!list_empty(&cil->xc_cil)) {
+	while (!list_empty(&ctx->log_items)) {
 		struct xfs_log_item	*item;
+		struct xfs_log_vec	*lv;
 
-		item = list_first_entry(&cil->xc_cil,
+		item = list_first_entry(&ctx->log_items,
 					struct xfs_log_item, li_cil);
 
 		if (test_bit(XFS_LI_WHITEOUT, &item->li_flags)) {
@@ -967,18 +1130,18 @@ xlog_cil_build_lv_chain(
 			continue;
 		}
 
-		list_del_init(&item->li_cil);
-		if (!ctx->lv_chain)
-			ctx->lv_chain = item->li_lv;
-		else
-			lv->lv_next = item->li_lv;
 		lv = item->li_lv;
-		item->li_lv = NULL;
-		*num_iovecs += lv->lv_niovecs;
+		lv->lv_order_id = item->li_order_id;
 
 		/* we don't write ordered log vectors */
 		if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED)
 			*num_bytes += lv->lv_bytes;
+		*num_iovecs += lv->lv_niovecs;
+		list_add_tail(&lv->lv_list, &ctx->lv_chain);
+
+		list_del_init(&item->li_cil);
+		item->li_order_id = 0;
+		item->li_lv = NULL;
 	}
 }
 
@@ -1022,10 +1185,11 @@ xlog_cil_push_work(
 	int			num_bytes = 0;
 	int			error = 0;
 	struct xlog_cil_trans_hdr thdr;
-	struct xfs_log_vec	lvhdr = { NULL };
+	struct xfs_log_vec	lvhdr = {};
 	xfs_csn_t		push_seq;
 	bool			push_commit_stable;
 	LIST_HEAD		(whiteouts);
+	struct xlog_ticket	*ticket;
 
 	new_ctx = xlog_cil_ctx_alloc();
 	new_ctx->ticket = xlog_cil_ticket_alloc(log);
@@ -1049,12 +1213,14 @@ xlog_cil_push_work(
 	if (waitqueue_active(&cil->xc_push_wait))
 		wake_up_all(&cil->xc_push_wait);
 
+	xlog_cil_push_pcp_aggregate(cil, ctx);
+
 	/*
 	 * Check if we've anything to push. If there is nothing, then we don't
 	 * move on to a new sequence number and so we have to be able to push
 	 * this sequence again later.
 	 */
-	if (list_empty(&cil->xc_cil)) {
+	if (test_bit(XLOG_CIL_EMPTY, &cil->xc_flags)) {
 		cil->xc_push_seq = 0;
 		spin_unlock(&cil->xc_push_lock);
 		goto out_skip;
@@ -1094,7 +1260,7 @@ xlog_cil_push_work(
 	list_add(&ctx->committing, &cil->xc_committing);
 	spin_unlock(&cil->xc_push_lock);
 
-	xlog_cil_build_lv_chain(cil, ctx, &whiteouts, &num_iovecs, &num_bytes);
+	xlog_cil_build_lv_chain(ctx, &whiteouts, &num_iovecs, &num_bytes);
 
 	/*
 	 * Switch the contexts so we can drop the context lock and move out
@@ -1126,15 +1292,31 @@ xlog_cil_push_work(
 	spin_unlock(&cil->xc_push_lock);
 	up_write(&cil->xc_ctx_lock);
 
+	/*
+	 * Sort the log vector chain before we add the transaction headers.
+	 * This ensures we always have the transaction headers at the start
+	 * of the chain.
+	 */
+	list_sort(NULL, &ctx->lv_chain, xlog_cil_order_cmp);
+
 	/*
 	 * Build a checkpoint transaction header and write it to the log to
 	 * begin the transaction. We need to account for the space used by the
 	 * transaction header here as it is not accounted for in xlog_write().
+	 * Add the lvhdr to the head of the lv chain we pass to xlog_write() so
+	 * it gets written into the iclog first.
 	 */
 	xlog_cil_build_trans_hdr(ctx, &thdr, &lvhdr, num_iovecs);
 	num_bytes += lvhdr.lv_bytes;
+	list_add(&lvhdr.lv_list, &ctx->lv_chain);
 
-	error = xlog_cil_write_chain(ctx, &lvhdr, num_bytes);
+	/*
+	 * Take the lvhdr back off the lv_chain immediately after calling
+	 * xlog_cil_write_chain() as it should not be passed to log IO
+	 * completion.
+	 */
+	error = xlog_cil_write_chain(ctx, num_bytes);
+	list_del(&lvhdr.lv_list);
 	if (error)
 		goto out_abort_free_ticket;
 
@@ -1142,7 +1324,14 @@ xlog_cil_push_work(
 	if (error)
 		goto out_abort_free_ticket;
 
-	xfs_log_ticket_ungrant(log, ctx->ticket);
+	/*
+	 * Grab the ticket from the ctx so we can ungrant it after releasing the
+	 * commit_iclog. The ctx may be freed by the time we return from
+	 * releasing the commit_iclog (i.e. checkpoint has been completed and
+	 * callback run) so we can't reference the ctx after the call to
+	 * xlog_state_release_iclog().
+	 */
+	ticket = ctx->ticket;
 
 	/*
 	 * If the checkpoint spans multiple iclogs, wait for all previous iclogs
@@ -1192,12 +1381,14 @@ xlog_cil_push_work(
 	if (push_commit_stable &&
 	    ctx->commit_iclog->ic_state == XLOG_STATE_ACTIVE)
 		xlog_state_switch_iclogs(log, ctx->commit_iclog, 0);
-	xlog_state_release_iclog(log, ctx->commit_iclog);
+	ticket = ctx->ticket;
+	xlog_state_release_iclog(log, ctx->commit_iclog, ticket);
 
 	/* Not safe to reference ctx now! */
 
 	spin_unlock(&log->l_icloglock);
 	xlog_cil_cleanup_whiteouts(&whiteouts);
+	xfs_log_ticket_ungrant(log, ticket);
 	return;
 
 out_skip:
@@ -1207,17 +1398,19 @@ out_skip:
 	return;
 
 out_abort_free_ticket:
-	xfs_log_ticket_ungrant(log, ctx->ticket);
 	ASSERT(xlog_is_shutdown(log));
 	xlog_cil_cleanup_whiteouts(&whiteouts);
 	if (!ctx->commit_iclog) {
+		xfs_log_ticket_ungrant(log, ctx->ticket);
 		xlog_cil_committed(ctx);
 		return;
 	}
 	spin_lock(&log->l_icloglock);
-	xlog_state_release_iclog(log, ctx->commit_iclog);
+	ticket = ctx->ticket;
+	xlog_state_release_iclog(log, ctx->commit_iclog, ticket);
 	/* Not safe to reference ctx now! */
 	spin_unlock(&log->l_icloglock);
+	xfs_log_ticket_ungrant(log, ticket);
 }
 
 /*
@@ -1232,18 +1425,27 @@ xlog_cil_push_background(
 	struct xlog	*log) __releases(cil->xc_ctx_lock)
 {
 	struct xfs_cil	*cil = log->l_cilp;
+	int		space_used = atomic_read(&cil->xc_ctx->space_used);
 
 	/*
 	 * The cil won't be empty because we are called while holding the
-	 * context lock so whatever we added to the CIL will still be there
+	 * context lock so whatever we added to the CIL will still be there.
 	 */
-	ASSERT(!list_empty(&cil->xc_cil));
+	ASSERT(!test_bit(XLOG_CIL_EMPTY, &cil->xc_flags));
 
 	/*
-	 * Don't do a background push if we haven't used up all the
-	 * space available yet.
+	 * We are done if:
+	 * - we haven't used up all the space available yet; or
+	 * - we've already queued up a push; and
+	 * - we're not over the hard limit; and
+	 * - nothing has been over the hard limit.
+	 *
+	 * If so, we don't need to take the push lock as there's nothing to do.
 	 */
-	if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) {
+	if (space_used < XLOG_CIL_SPACE_LIMIT(log) ||
+	    (cil->xc_push_seq == cil->xc_current_sequence &&
+	     space_used < XLOG_CIL_BLOCKING_SPACE_LIMIT(log) &&
+	     !waitqueue_active(&cil->xc_push_wait))) {
 		up_read(&cil->xc_ctx_lock);
 		return;
 	}
@@ -1270,12 +1472,11 @@ xlog_cil_push_background(
 	 * dipping back down under the hard limit.
 	 *
 	 * The ctx->xc_push_lock provides the serialisation necessary for safely
-	 * using the lockless waitqueue_active() check in this context.
+	 * calling xlog_cil_over_hard_limit() in this context.
 	 */
-	if (cil->xc_ctx->space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log) ||
-	    waitqueue_active(&cil->xc_push_wait)) {
+	if (xlog_cil_over_hard_limit(log, space_used)) {
 		trace_xfs_log_cil_wait(log, cil->xc_ctx->ticket);
-		ASSERT(cil->xc_ctx->space_used < log->l_logsize);
+		ASSERT(space_used < log->l_logsize);
 		xlog_wait(&cil->xc_push_wait, &cil->xc_push_lock);
 		return;
 	}
@@ -1334,7 +1535,8 @@ xlog_cil_push_now(
 	 * If the CIL is empty or we've already pushed the sequence then
 	 * there's no more work that we need to do.
 	 */
-	if (list_empty(&cil->xc_cil) || push_seq <= cil->xc_push_seq) {
+	if (test_bit(XLOG_CIL_EMPTY, &cil->xc_flags) ||
+	    push_seq <= cil->xc_push_seq) {
 		spin_unlock(&cil->xc_push_lock);
 		return;
 	}
@@ -1352,7 +1554,7 @@ xlog_cil_empty(
 	bool		empty = false;
 
 	spin_lock(&cil->xc_push_lock);
-	if (list_empty(&cil->xc_cil))
+	if (test_bit(XLOG_CIL_EMPTY, &cil->xc_flags))
 		empty = true;
 	spin_unlock(&cil->xc_push_lock);
 	return empty;
@@ -1483,7 +1685,7 @@ xlog_cil_flush(
 	 * If the CIL is empty, make sure that any previous checkpoint that may
 	 * still be in an active iclog is pushed to stable storage.
 	 */
-	if (list_empty(&log->l_cilp->xc_cil))
+	if (test_bit(XLOG_CIL_EMPTY, &log->l_cilp->xc_flags))
 		xfs_log_force(log->l_mp, 0);
 }
 
@@ -1568,7 +1770,7 @@ restart:
 	 * we would have found the context on the committing list.
 	 */
 	if (sequence == cil->xc_current_sequence &&
-	    !list_empty(&cil->xc_cil)) {
+	    !test_bit(XLOG_CIL_EMPTY, &cil->xc_flags)) {
 		spin_unlock(&cil->xc_push_lock);
 		goto restart;
 	}
@@ -1588,15 +1790,49 @@ out_shutdown:
 	return 0;
 }
 
+/*
+ * Move dead percpu state to the relevant CIL context structures.
+ *
+ * We have to lock the CIL context here to ensure that nothing is modifying
+ * the percpu state, either addition or removal. Both of these are done under
+ * the CIL context lock, so grabbing that exclusively here will ensure we can
+ * safely drain the cilpcp for the CPU that is dying.
+ */
+void
+xlog_cil_pcp_dead(
+	struct xlog		*log,
+	unsigned int		cpu)
+{
+	struct xfs_cil		*cil = log->l_cilp;
+	struct xlog_cil_pcp	*cilpcp = per_cpu_ptr(cil->xc_pcp, cpu);
+	struct xfs_cil_ctx	*ctx;
+
+	down_write(&cil->xc_ctx_lock);
+	ctx = cil->xc_ctx;
+	if (ctx->ticket)
+		ctx->ticket->t_curr_res += cilpcp->space_reserved;
+	cilpcp->space_reserved = 0;
+
+	if (!list_empty(&cilpcp->log_items))
+		list_splice_init(&cilpcp->log_items, &ctx->log_items);
+	if (!list_empty(&cilpcp->busy_extents))
+		list_splice_init(&cilpcp->busy_extents, &ctx->busy_extents);
+	atomic_add(cilpcp->space_used, &ctx->space_used);
+	cilpcp->space_used = 0;
+	up_write(&cil->xc_ctx_lock);
+}
+
 /*
  * Perform initial CIL structure initialisation.
  */
 int
 xlog_cil_init(
-	struct xlog	*log)
+	struct xlog		*log)
 {
-	struct xfs_cil	*cil;
-	struct xfs_cil_ctx *ctx;
+	struct xfs_cil		*cil;
+	struct xfs_cil_ctx	*ctx;
+	struct xlog_cil_pcp	*cilpcp;
+	int			cpu;
 
 	cil = kmem_zalloc(sizeof(*cil), KM_MAYFAIL);
 	if (!cil)
@@ -1611,22 +1847,31 @@ xlog_cil_init(
 	if (!cil->xc_push_wq)
 		goto out_destroy_cil;
 
-	INIT_LIST_HEAD(&cil->xc_cil);
+	cil->xc_log = log;
+	cil->xc_pcp = alloc_percpu(struct xlog_cil_pcp);
+	if (!cil->xc_pcp)
+		goto out_destroy_wq;
+
+	for_each_possible_cpu(cpu) {
+		cilpcp = per_cpu_ptr(cil->xc_pcp, cpu);
+		INIT_LIST_HEAD(&cilpcp->busy_extents);
+		INIT_LIST_HEAD(&cilpcp->log_items);
+	}
+
 	INIT_LIST_HEAD(&cil->xc_committing);
-	spin_lock_init(&cil->xc_cil_lock);
 	spin_lock_init(&cil->xc_push_lock);
 	init_waitqueue_head(&cil->xc_push_wait);
 	init_rwsem(&cil->xc_ctx_lock);
 	init_waitqueue_head(&cil->xc_start_wait);
 	init_waitqueue_head(&cil->xc_commit_wait);
-	cil->xc_log = log;
 	log->l_cilp = cil;
 
 	ctx = xlog_cil_ctx_alloc();
 	xlog_cil_ctx_switch(cil, ctx);
-
 	return 0;
 
+out_destroy_wq:
+	destroy_workqueue(cil->xc_push_wq);
 out_destroy_cil:
 	kmem_free(cil);
 	return -ENOMEM;
@@ -1636,14 +1881,17 @@ void
 xlog_cil_destroy(
 	struct xlog	*log)
 {
-	if (log->l_cilp->xc_ctx) {
-		if (log->l_cilp->xc_ctx->ticket)
-			xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket);
-		kmem_free(log->l_cilp->xc_ctx);
+	struct xfs_cil	*cil = log->l_cilp;
+
+	if (cil->xc_ctx) {
+		if (cil->xc_ctx->ticket)
+			xfs_log_ticket_put(cil->xc_ctx->ticket);
+		kmem_free(cil->xc_ctx);
 	}
 
-	ASSERT(list_empty(&log->l_cilp->xc_cil));
-	destroy_workqueue(log->l_cilp->xc_push_wq);
-	kmem_free(log->l_cilp);
+	ASSERT(test_bit(XLOG_CIL_EMPTY, &cil->xc_flags));
+	free_percpu(cil->xc_pcp);
+	destroy_workqueue(cil->xc_push_wq);
+	kmem_free(cil);
 }
 
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 686c01eb3661..1bd2963e8fbd 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -143,15 +143,16 @@ enum xlog_iclog_state {
 #define XLOG_COVER_OPS		5
 
 typedef struct xlog_ticket {
-	struct list_head   t_queue;	 /* reserve/write queue */
-	struct task_struct *t_task;	 /* task that owns this ticket */
-	xlog_tid_t	   t_tid;	 /* transaction identifier	 : 4  */
-	atomic_t	   t_ref;	 /* ticket reference count       : 4  */
-	int		   t_curr_res;	 /* current reservation in bytes : 4  */
-	int		   t_unit_res;	 /* unit reservation in bytes    : 4  */
-	char		   t_ocnt;	 /* original count		 : 1  */
-	char		   t_cnt;	 /* current count		 : 1  */
-	uint8_t		   t_flags;	 /* properties of reservation	 : 1  */
+	struct list_head	t_queue;	/* reserve/write queue */
+	struct task_struct	*t_task;	/* task that owns this ticket */
+	xlog_tid_t		t_tid;		/* transaction identifier */
+	atomic_t		t_ref;		/* ticket reference count */
+	int			t_curr_res;	/* current reservation */
+	int			t_unit_res;	/* unit reservation */
+	char			t_ocnt;		/* original unit count */
+	char			t_cnt;		/* current unit count */
+	uint8_t			t_flags;	/* properties of reservation */
+	int			t_iclog_hdrs;	/* iclog hdrs in t_curr_res */
 } xlog_ticket_t;
 
 /*
@@ -221,13 +222,25 @@ struct xfs_cil_ctx {
 	xfs_lsn_t		commit_lsn;	/* chkpt commit record lsn */
 	struct xlog_in_core	*commit_iclog;
 	struct xlog_ticket	*ticket;	/* chkpt ticket */
-	int			space_used;	/* aggregate size of regions */
+	atomic_t		space_used;	/* aggregate size of regions */
 	struct list_head	busy_extents;	/* busy extents in chkpt */
-	struct xfs_log_vec	*lv_chain;	/* logvecs being pushed */
+	struct list_head	log_items;	/* log items in chkpt */
+	struct list_head	lv_chain;	/* logvecs being pushed */
 	struct list_head	iclog_entry;
 	struct list_head	committing;	/* ctx committing list */
 	struct work_struct	discard_endio_work;
 	struct work_struct	push_work;
+	atomic_t		order_id;
+};
+
+/*
+ * Per-cpu CIL tracking items
+ */
+struct xlog_cil_pcp {
+	int32_t			space_used;
+	uint32_t		space_reserved;
+	struct list_head	busy_extents;
+	struct list_head	log_items;
 };
 
 /*
@@ -248,8 +261,8 @@ struct xfs_cil_ctx {
  */
 struct xfs_cil {
 	struct xlog		*xc_log;
-	struct list_head	xc_cil;
-	spinlock_t		xc_cil_lock;
+	unsigned long		xc_flags;
+	atomic_t		xc_iclog_hdrs;
 	struct workqueue_struct	*xc_push_wq;
 
 	struct rw_semaphore	xc_ctx_lock ____cacheline_aligned_in_smp;
@@ -263,8 +276,17 @@ struct xfs_cil {
 	wait_queue_head_t	xc_start_wait;
 	xfs_csn_t		xc_current_sequence;
 	wait_queue_head_t	xc_push_wait;	/* background push throttle */
+
+	void __percpu		*xc_pcp;	/* percpu CIL structures */
+#ifdef CONFIG_HOTPLUG_CPU
+	struct list_head	xc_pcp_list;
+#endif
 } ____cacheline_aligned_in_smp;
 
+/* xc_flags bit values */
+#define	XLOG_CIL_EMPTY		1
+#define XLOG_CIL_PCP_SPACE	2
+
 /*
  * The amount of log space we allow the CIL to aggregate is difficult to size.
  * Whatever we choose, we have to make sure we can get a reservation for the
@@ -486,14 +508,15 @@ struct xlog_ticket *xlog_ticket_alloc(struct xlog *log, int unit_bytes,
 void	xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket);
 void	xlog_print_trans(struct xfs_trans *);
 int	xlog_write(struct xlog *log, struct xfs_cil_ctx *ctx,
-		struct xfs_log_vec *log_vector, struct xlog_ticket *tic,
+		struct list_head *lv_chain, struct xlog_ticket *tic,
 		uint32_t len);
 void	xfs_log_ticket_ungrant(struct xlog *log, struct xlog_ticket *ticket);
 void	xfs_log_ticket_regrant(struct xlog *log, struct xlog_ticket *ticket);
 
 void xlog_state_switch_iclogs(struct xlog *log, struct xlog_in_core *iclog,
 		int eventual_size);
-int xlog_state_release_iclog(struct xlog *log, struct xlog_in_core *iclog);
+int xlog_state_release_iclog(struct xlog *log, struct xlog_in_core *iclog,
+		struct xlog_ticket *ticket);
 
 /*
  * When we crack an atomic LSN, we sample it first so that the value will not
@@ -682,4 +705,9 @@ xlog_kvmalloc(
 	return p;
 }
 
+/*
+ * CIL CPU dead notifier
+ */
+void xlog_cil_pcp_dead(struct xlog *log, unsigned int cpu);
+
 #endif	/* __XFS_LOG_PRIV_H__ */
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 940c8107cbd4..17e923b9c5fa 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -2629,21 +2629,21 @@ xlog_recover_cancel_intents(
  */
 STATIC void
 xlog_recover_clear_agi_bucket(
-	xfs_mount_t	*mp,
-	xfs_agnumber_t	agno,
-	int		bucket)
+	struct xfs_perag	*pag,
+	int			bucket)
 {
-	xfs_trans_t	*tp;
-	xfs_agi_t	*agi;
-	struct xfs_buf	*agibp;
-	int		offset;
-	int		error;
+	struct xfs_mount	*mp = pag->pag_mount;
+	struct xfs_trans	*tp;
+	struct xfs_agi		*agi;
+	struct xfs_buf		*agibp;
+	int			offset;
+	int			error;
 
 	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_clearagi, 0, 0, 0, &tp);
 	if (error)
 		goto out_error;
 
-	error = xfs_read_agi(mp, tp, agno, &agibp);
+	error = xfs_read_agi(pag, tp, &agibp);
 	if (error)
 		goto out_abort;
 
@@ -2662,60 +2662,62 @@ xlog_recover_clear_agi_bucket(
 out_abort:
 	xfs_trans_cancel(tp);
 out_error:
-	xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__, agno);
+	xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__,
+			pag->pag_agno);
 	return;
 }
 
-STATIC xfs_agino_t
-xlog_recover_process_one_iunlink(
-	struct xfs_mount		*mp,
-	xfs_agnumber_t			agno,
-	xfs_agino_t			agino,
-	int				bucket)
+static int
+xlog_recover_iunlink_bucket(
+	struct xfs_perag	*pag,
+	struct xfs_agi		*agi,
+	int			bucket)
 {
-	struct xfs_buf			*ibp;
-	struct xfs_dinode		*dip;
-	struct xfs_inode		*ip;
-	xfs_ino_t			ino;
-	int				error;
+	struct xfs_mount	*mp = pag->pag_mount;
+	struct xfs_inode	*prev_ip = NULL;
+	struct xfs_inode	*ip;
+	xfs_agino_t		prev_agino, agino;
+	int			error = 0;
 
-	ino = XFS_AGINO_TO_INO(mp, agno, agino);
-	error = xfs_iget(mp, NULL, ino, 0, 0, &ip);
-	if (error)
-		goto fail;
+	agino = be32_to_cpu(agi->agi_unlinked[bucket]);
+	while (agino != NULLAGINO) {
+		error = xfs_iget(mp, NULL,
+				XFS_AGINO_TO_INO(mp, pag->pag_agno, agino),
+				0, 0, &ip);
+		if (error)
+			break;
 
-	/*
-	 * Get the on disk inode to find the next inode in the bucket.
-	 */
-	error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &ibp);
-	if (error)
-		goto fail_iput;
-	dip = xfs_buf_offset(ibp, ip->i_imap.im_boffset);
+		ASSERT(VFS_I(ip)->i_nlink == 0);
+		ASSERT(VFS_I(ip)->i_mode != 0);
+		xfs_iflags_clear(ip, XFS_IRECOVERY);
+		agino = ip->i_next_unlinked;
 
-	xfs_iflags_clear(ip, XFS_IRECOVERY);
-	ASSERT(VFS_I(ip)->i_nlink == 0);
-	ASSERT(VFS_I(ip)->i_mode != 0);
+		if (prev_ip) {
+			ip->i_prev_unlinked = prev_agino;
+			xfs_irele(prev_ip);
 
-	/* setup for the next pass */
-	agino = be32_to_cpu(dip->di_next_unlinked);
-	xfs_buf_relse(ibp);
+			/*
+			 * Ensure the inode is removed from the unlinked list
+			 * before we continue so that it won't race with
+			 * building the in-memory list here. This could be
+			 * serialised with the agibp lock, but that just
+			 * serialises via lockstepping and it's much simpler
+			 * just to flush the inodegc queue and wait for it to
+			 * complete.
+			 */
+			xfs_inodegc_flush(mp);
+		}
 
-	xfs_irele(ip);
-	return agino;
+		prev_agino = agino;
+		prev_ip = ip;
+	}
 
- fail_iput:
-	xfs_irele(ip);
- fail:
-	/*
-	 * We can't read in the inode this bucket points to, or this inode
-	 * is messed up.  Just ditch this bucket of inodes.  We will lose
-	 * some inodes and space, but at least we won't hang.
-	 *
-	 * Call xlog_recover_clear_agi_bucket() to perform a transaction to
-	 * clear the inode pointer in the bucket.
-	 */
-	xlog_recover_clear_agi_bucket(mp, agno, bucket);
-	return NULLAGINO;
+	if (prev_ip) {
+		ip->i_prev_unlinked = prev_agino;
+		xfs_irele(prev_ip);
+	}
+	xfs_inodegc_flush(mp);
+	return error;
 }
 
 /*
@@ -2741,59 +2743,70 @@ xlog_recover_process_one_iunlink(
  * scheduled on this CPU to ensure other scheduled work can run without undue
  * latency.
  */
-STATIC void
-xlog_recover_process_iunlinks(
-	struct xlog	*log)
+static void
+xlog_recover_iunlink_ag(
+	struct xfs_perag	*pag)
 {
-	struct xfs_mount	*mp = log->l_mp;
-	struct xfs_perag	*pag;
-	xfs_agnumber_t		agno;
 	struct xfs_agi		*agi;
 	struct xfs_buf		*agibp;
-	xfs_agino_t		agino;
 	int			bucket;
 	int			error;
 
-	for_each_perag(mp, agno, pag) {
-		error = xfs_read_agi(mp, NULL, pag->pag_agno, &agibp);
+	error = xfs_read_agi(pag, NULL, &agibp);
+	if (error) {
+		/*
+		 * AGI is b0rked. Don't process it.
+		 *
+		 * We should probably mark the filesystem as corrupt after we've
+		 * recovered all the ag's we can....
+		 */
+		return;
+	}
+
+	/*
+	 * Unlock the buffer so that it can be acquired in the normal course of
+	 * the transaction to truncate and free each inode.  Because we are not
+	 * racing with anyone else here for the AGI buffer, we don't even need
+	 * to hold it locked to read the initial unlinked bucket entries out of
+	 * the buffer. We keep buffer reference though, so that it stays pinned
+	 * in memory while we need the buffer.
+	 */
+	agi = agibp->b_addr;
+	xfs_buf_unlock(agibp);
+
+	for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) {
+		error = xlog_recover_iunlink_bucket(pag, agi, bucket);
 		if (error) {
 			/*
-			 * AGI is b0rked. Don't process it.
-			 *
-			 * We should probably mark the filesystem as corrupt
-			 * after we've recovered all the ag's we can....
+			 * Bucket is unrecoverable, so only a repair scan can
+			 * free the remaining unlinked inodes. Just empty the
+			 * bucket and remaining inodes on it unreferenced and
+			 * unfreeable.
 			 */
-			continue;
+			xfs_inodegc_flush(pag->pag_mount);
+			xlog_recover_clear_agi_bucket(pag, bucket);
 		}
-		/*
-		 * Unlock the buffer so that it can be acquired in the normal
-		 * course of the transaction to truncate and free each inode.
-		 * Because we are not racing with anyone else here for the AGI
-		 * buffer, we don't even need to hold it locked to read the
-		 * initial unlinked bucket entries out of the buffer. We keep
-		 * buffer reference though, so that it stays pinned in memory
-		 * while we need the buffer.
-		 */
-		agi = agibp->b_addr;
-		xfs_buf_unlock(agibp);
-
-		for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) {
-			agino = be32_to_cpu(agi->agi_unlinked[bucket]);
-			while (agino != NULLAGINO) {
-				agino = xlog_recover_process_one_iunlink(mp,
-						pag->pag_agno, agino, bucket);
-				cond_resched();
-			}
-		}
-		xfs_buf_rele(agibp);
 	}
 
+	xfs_buf_rele(agibp);
+}
+
+static void
+xlog_recover_process_iunlinks(
+	struct xlog	*log)
+{
+	struct xfs_perag	*pag;
+	xfs_agnumber_t		agno;
+
+	for_each_perag(log->l_mp, agno, pag)
+		xlog_recover_iunlink_ag(pag);
+
 	/*
 	 * Flush the pending unlinked inodes to ensure that the inactivations
 	 * are fully completed on disk and the incore inodes can be reclaimed
 	 * before we signal that recovery is complete.
 	 */
-	xfs_inodegc_flush(mp);
+	xfs_inodegc_flush(log->l_mp);
 }
 
 STATIC void
@@ -3313,7 +3326,8 @@ xlog_do_recover(
 	/* re-initialise in-core superblock and geometry structures */
 	mp->m_features |= xfs_sb_version_to_features(sbp);
 	xfs_reinit_percpu_counters(mp);
-	error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi);
+	error = xfs_initialize_perag(mp, sbp->sb_agcount, sbp->sb_dblocks,
+			&mp->m_maxagi);
 	if (error) {
 		xfs_warn(mp, "Failed post-recovery per-ag init: %d", error);
 		return error;
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index daa8d29c46b4..f10c88cee116 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -778,7 +778,8 @@ xfs_mountfs(
 	/*
 	 * Allocate and initialize the per-ag data.
 	 */
-	error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi);
+	error = xfs_initialize_perag(mp, sbp->sb_agcount, mp->m_sb.sb_dblocks,
+			&mp->m_maxagi);
 	if (error) {
 		xfs_warn(mp, "Failed per-ag init: %d", error);
 		goto out_free_dir;
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index abf08bbf34a9..57dd3b722265 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -1154,7 +1154,7 @@ xfs_qm_dqusage_adjust(
 	ASSERT(ip->i_delayed_blks == 0);
 
 	if (XFS_IS_REALTIME_INODE(ip)) {
-		struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+		struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
 
 		error = xfs_iread_extents(tp, ip, XFS_DATA_FORK);
 		if (error)
@@ -1229,12 +1229,11 @@ xfs_qm_flush_one(
 	 */
 	if (!xfs_dqflock_nowait(dqp)) {
 		/* buf is pinned in-core by delwri list */
-		bp = xfs_buf_incore(mp->m_ddev_targp, dqp->q_blkno,
-				mp->m_quotainfo->qi_dqchunklen, 0);
-		if (!bp) {
-			error = -EINVAL;
+		error = xfs_buf_incore(mp->m_ddev_targp, dqp->q_blkno,
+				mp->m_quotainfo->qi_dqchunklen, 0, &bp);
+		if (error)
 			goto out_unlock;
-		}
+
 		xfs_buf_unlock(bp);
 
 		xfs_buf_delwri_pushbuf(bp, buffer_list);
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index e7a7c00d93be..724806c7ce3e 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -125,11 +125,10 @@
  * shared blocks.  If there are no shared extents, fbno and flen will
  * be set to NULLAGBLOCK and 0, respectively.
  */
-int
+static int
 xfs_reflink_find_shared(
-	struct xfs_mount	*mp,
+	struct xfs_perag	*pag,
 	struct xfs_trans	*tp,
-	xfs_agnumber_t		agno,
 	xfs_agblock_t		agbno,
 	xfs_extlen_t		aglen,
 	xfs_agblock_t		*fbno,
@@ -140,11 +139,11 @@ xfs_reflink_find_shared(
 	struct xfs_btree_cur	*cur;
 	int			error;
 
-	error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp);
+	error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
 	if (error)
 		return error;
 
-	cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agbp->b_pag);
+	cur = xfs_refcountbt_init_cursor(pag->pag_mount, tp, agbp, pag);
 
 	error = xfs_refcount_find_shared(cur, agbno, aglen, fbno, flen,
 			find_end_of_shared);
@@ -171,7 +170,8 @@ xfs_reflink_trim_around_shared(
 	struct xfs_bmbt_irec	*irec,
 	bool			*shared)
 {
-	xfs_agnumber_t		agno;
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_perag	*pag;
 	xfs_agblock_t		agbno;
 	xfs_extlen_t		aglen;
 	xfs_agblock_t		fbno;
@@ -186,12 +186,13 @@ xfs_reflink_trim_around_shared(
 
 	trace_xfs_reflink_trim_around_shared(ip, irec);
 
-	agno = XFS_FSB_TO_AGNO(ip->i_mount, irec->br_startblock);
-	agbno = XFS_FSB_TO_AGBNO(ip->i_mount, irec->br_startblock);
+	pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, irec->br_startblock));
+	agbno = XFS_FSB_TO_AGBNO(mp, irec->br_startblock);
 	aglen = irec->br_blockcount;
 
-	error = xfs_reflink_find_shared(ip->i_mount, NULL, agno, agbno,
-			aglen, &fbno, &flen, true);
+	error = xfs_reflink_find_shared(pag, NULL, agbno, aglen, &fbno, &flen,
+			true);
+	xfs_perag_put(pag);
 	if (error)
 		return error;
 
@@ -452,7 +453,7 @@ xfs_reflink_cancel_cow_blocks(
 	xfs_fileoff_t			end_fsb,
 	bool				cancel_real)
 {
-	struct xfs_ifork		*ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
+	struct xfs_ifork		*ifp = xfs_ifork_ptr(ip, XFS_COW_FORK);
 	struct xfs_bmbt_irec		got, del;
 	struct xfs_iext_cursor		icur;
 	int				error = 0;
@@ -593,7 +594,7 @@ xfs_reflink_end_cow_extent(
 	struct xfs_bmbt_irec	got, del, data;
 	struct xfs_mount	*mp = ip->i_mount;
 	struct xfs_trans	*tp;
-	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, XFS_COW_FORK);
 	unsigned int		resblks;
 	int			nmaps;
 	int			error;
@@ -1420,16 +1421,11 @@ xfs_reflink_inode_has_shared_extents(
 	struct xfs_bmbt_irec		got;
 	struct xfs_mount		*mp = ip->i_mount;
 	struct xfs_ifork		*ifp;
-	xfs_agnumber_t			agno;
-	xfs_agblock_t			agbno;
-	xfs_extlen_t			aglen;
-	xfs_agblock_t			rbno;
-	xfs_extlen_t			rlen;
 	struct xfs_iext_cursor		icur;
 	bool				found;
 	int				error;
 
-	ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+	ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
 	error = xfs_iread_extents(tp, ip, XFS_DATA_FORK);
 	if (error)
 		return error;
@@ -1437,17 +1433,25 @@ xfs_reflink_inode_has_shared_extents(
 	*has_shared = false;
 	found = xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got);
 	while (found) {
+		struct xfs_perag	*pag;
+		xfs_agblock_t		agbno;
+		xfs_extlen_t		aglen;
+		xfs_agblock_t		rbno;
+		xfs_extlen_t		rlen;
+
 		if (isnullstartblock(got.br_startblock) ||
 		    got.br_state != XFS_EXT_NORM)
 			goto next;
-		agno = XFS_FSB_TO_AGNO(mp, got.br_startblock);
+
+		pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, got.br_startblock));
 		agbno = XFS_FSB_TO_AGBNO(mp, got.br_startblock);
 		aglen = got.br_blockcount;
-
-		error = xfs_reflink_find_shared(mp, tp, agno, agbno, aglen,
+		error = xfs_reflink_find_shared(pag, tp, agbno, aglen,
 				&rbno, &rlen, false);
+		xfs_perag_put(pag);
 		if (error)
 			return error;
+
 		/* Is there still a shared block here? */
 		if (rbno != NULLAGBLOCK) {
 			*has_shared = true;
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
index bea65f2fe657..65c5dfe17ecf 100644
--- a/fs/xfs/xfs_reflink.h
+++ b/fs/xfs/xfs_reflink.h
@@ -16,9 +16,6 @@ static inline bool xfs_is_cow_inode(struct xfs_inode *ip)
 	return xfs_is_reflink_inode(ip) || xfs_is_always_cow_inode(ip);
 }
 
-extern int xfs_reflink_find_shared(struct xfs_mount *mp, struct xfs_trans *tp,
-		xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_extlen_t aglen,
-		xfs_agblock_t *fbno, xfs_extlen_t *flen, bool find_maximal);
 extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip,
 		struct xfs_bmbt_irec *irec, bool *shared);
 int xfs_bmap_trim_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap,
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index aa977c7ea370..3d27ba1295c9 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -40,6 +40,7 @@
 #include "xfs_defer.h"
 #include "xfs_attr_item.h"
 #include "xfs_xattr.h"
+#include "xfs_iunlink_item.h"
 
 #include <linux/magic.h>
 #include <linux/fs_context.h>
@@ -1966,11 +1967,19 @@ xfs_init_caches(void)
 {
 	int		error;
 
+	xfs_buf_cache = kmem_cache_create("xfs_buf", sizeof(struct xfs_buf), 0,
+					 SLAB_HWCACHE_ALIGN |
+					 SLAB_RECLAIM_ACCOUNT |
+					 SLAB_MEM_SPREAD,
+					 NULL);
+	if (!xfs_buf_cache)
+		goto out;
+
 	xfs_log_ticket_cache = kmem_cache_create("xfs_log_ticket",
 						sizeof(struct xlog_ticket),
 						0, 0, NULL);
 	if (!xfs_log_ticket_cache)
-		goto out;
+		goto out_destroy_buf_cache;
 
 	error = xfs_btree_init_cur_caches();
 	if (error)
@@ -2096,8 +2105,16 @@ xfs_init_caches(void)
 	if (!xfs_attri_cache)
 		goto out_destroy_attrd_cache;
 
+	xfs_iunlink_cache = kmem_cache_create("xfs_iul_item",
+					     sizeof(struct xfs_iunlink_item),
+					     0, 0, NULL);
+	if (!xfs_iunlink_cache)
+		goto out_destroy_attri_cache;
+
 	return 0;
 
+ out_destroy_attri_cache:
+	kmem_cache_destroy(xfs_attri_cache);
  out_destroy_attrd_cache:
 	kmem_cache_destroy(xfs_attrd_cache);
  out_destroy_bui_cache:
@@ -2136,6 +2153,8 @@ xfs_init_caches(void)
 	xfs_btree_destroy_cur_caches();
  out_destroy_log_ticket_cache:
 	kmem_cache_destroy(xfs_log_ticket_cache);
+ out_destroy_buf_cache:
+	kmem_cache_destroy(xfs_buf_cache);
  out:
 	return -ENOMEM;
 }
@@ -2148,6 +2167,7 @@ xfs_destroy_caches(void)
 	 * destroy caches.
 	 */
 	rcu_barrier();
+	kmem_cache_destroy(xfs_iunlink_cache);
 	kmem_cache_destroy(xfs_attri_cache);
 	kmem_cache_destroy(xfs_attrd_cache);
 	kmem_cache_destroy(xfs_bui_cache);
@@ -2168,6 +2188,7 @@ xfs_destroy_caches(void)
 	xfs_defer_destroy_item_caches();
 	xfs_btree_destroy_cur_caches();
 	kmem_cache_destroy(xfs_log_ticket_cache);
+	kmem_cache_destroy(xfs_buf_cache);
 }
 
 STATIC int __init
@@ -2213,6 +2234,7 @@ xfs_cpu_dead(
 	list_for_each_entry_safe(mp, n, &xfs_mount_list, m_mount_list) {
 		spin_unlock(&xfs_mount_list_lock);
 		xfs_inodegc_cpu_dead(mp, cpu);
+		xlog_cil_pcp_dead(mp->m_log, cpu);
 		spin_lock(&xfs_mount_list_lock);
 	}
 	spin_unlock(&xfs_mount_list_lock);
@@ -2272,13 +2294,9 @@ init_xfs_fs(void)
 	if (error)
 		goto out_destroy_wq;
 
-	error = xfs_buf_init();
-	if (error)
-		goto out_mru_cache_uninit;
-
 	error = xfs_init_procfs();
 	if (error)
-		goto out_buf_terminate;
+		goto out_mru_cache_uninit;
 
 	error = xfs_sysctl_register();
 	if (error)
@@ -2335,8 +2353,6 @@ init_xfs_fs(void)
 	xfs_sysctl_unregister();
  out_cleanup_procfs:
 	xfs_cleanup_procfs();
- out_buf_terminate:
-	xfs_buf_terminate();
  out_mru_cache_uninit:
 	xfs_mru_cache_uninit();
  out_destroy_wq:
@@ -2362,7 +2378,6 @@ exit_xfs_fs(void)
 	kset_unregister(xfs_kset);
 	xfs_sysctl_unregister();
 	xfs_cleanup_procfs();
-	xfs_buf_terminate();
 	xfs_mru_cache_uninit();
 	xfs_destroy_workqueues();
 	xfs_destroy_caches();
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 4145ba872547..8389f3ef88ef 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -256,7 +256,7 @@ xfs_symlink(
 	/*
 	 * If the symlink will fit into the inode, write it inline.
 	 */
-	if (pathlen <= XFS_IFORK_DSIZE(ip)) {
+	if (pathlen <= xfs_inode_data_fork_size(ip)) {
 		xfs_init_local_fork(ip, XFS_DATA_FORK, target_path, pathlen);
 
 		ip->i_disk_size = pathlen;
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 0fa1b7a2918c..f9057af6e0c8 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -2171,7 +2171,7 @@ DECLARE_EVENT_CLASS(xfs_swap_extent_class,
 		__entry->format = ip->i_df.if_format;
 		__entry->nex = ip->i_df.if_nextents;
 		__entry->broot_size = ip->i_df.if_broot_bytes;
-		__entry->fork_off = XFS_IFORK_BOFF(ip);
+		__entry->fork_off = xfs_inode_fork_boff(ip);
 	),
 	TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %llu, "
 		  "broot size %d, forkoff 0x%x",
@@ -3672,7 +3672,6 @@ DEFINE_EVENT(xfs_ag_inode_class, name, \
 	TP_ARGS(ip))
 DEFINE_AGINODE_EVENT(xfs_iunlink);
 DEFINE_AGINODE_EVENT(xfs_iunlink_remove);
-DEFINE_AG_EVENT(xfs_iunlink_map_prev_fallback);
 
 DECLARE_EVENT_CLASS(xfs_fs_corrupt_class,
 	TP_PROTO(struct xfs_mount *mp, unsigned int flags),
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 82cf0189c0db..7bd16fbff534 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -760,7 +760,7 @@ xfs_log_item_batch_insert(
 void
 xfs_trans_committed_bulk(
 	struct xfs_ail		*ailp,
-	struct xfs_log_vec	*log_vector,
+	struct list_head	*lv_chain,
 	xfs_lsn_t		commit_lsn,
 	bool			aborted)
 {
@@ -775,7 +775,7 @@ xfs_trans_committed_bulk(
 	spin_unlock(&ailp->ail_lock);
 
 	/* unpin all the log items */
-	for (lv = log_vector; lv; lv = lv->lv_next ) {
+	list_for_each_entry(lv, lv_chain, lv_list) {
 		struct xfs_log_item	*lip = lv->lv_item;
 		xfs_lsn_t		item_lsn;
 
@@ -844,6 +844,90 @@ xfs_trans_committed_bulk(
 	spin_unlock(&ailp->ail_lock);
 }
 
+/*
+ * Sort transaction items prior to running precommit operations. This will
+ * attempt to order the items such that they will always be locked in the same
+ * order. Items that have no sort function are moved to the end of the list
+ * and so are locked last.
+ *
+ * This may need refinement as different types of objects add sort functions.
+ *
+ * Function is more complex than it needs to be because we are comparing 64 bit
+ * values and the function only returns 32 bit values.
+ */
+static int
+xfs_trans_precommit_sort(
+	void			*unused_arg,
+	const struct list_head	*a,
+	const struct list_head	*b)
+{
+	struct xfs_log_item	*lia = container_of(a,
+					struct xfs_log_item, li_trans);
+	struct xfs_log_item	*lib = container_of(b,
+					struct xfs_log_item, li_trans);
+	int64_t			diff;
+
+	/*
+	 * If both items are non-sortable, leave them alone. If only one is
+	 * sortable, move the non-sortable item towards the end of the list.
+	 */
+	if (!lia->li_ops->iop_sort && !lib->li_ops->iop_sort)
+		return 0;
+	if (!lia->li_ops->iop_sort)
+		return 1;
+	if (!lib->li_ops->iop_sort)
+		return -1;
+
+	diff = lia->li_ops->iop_sort(lia) - lib->li_ops->iop_sort(lib);
+	if (diff < 0)
+		return -1;
+	if (diff > 0)
+		return 1;
+	return 0;
+}
+
+/*
+ * Run transaction precommit functions.
+ *
+ * If there is an error in any of the callouts, then stop immediately and
+ * trigger a shutdown to abort the transaction. There is no recovery possible
+ * from errors at this point as the transaction is dirty....
+ */
+static int
+xfs_trans_run_precommits(
+	struct xfs_trans	*tp)
+{
+	struct xfs_mount	*mp = tp->t_mountp;
+	struct xfs_log_item	*lip, *n;
+	int			error = 0;
+
+	/*
+	 * Sort the item list to avoid ABBA deadlocks with other transactions
+	 * running precommit operations that lock multiple shared items such as
+	 * inode cluster buffers.
+	 */
+	list_sort(NULL, &tp->t_items, xfs_trans_precommit_sort);
+
+	/*
+	 * Precommit operations can remove the log item from the transaction
+	 * if the log item exists purely to delay modifications until they
+	 * can be ordered against other operations. Hence we have to use
+	 * list_for_each_entry_safe() here.
+	 */
+	list_for_each_entry_safe(lip, n, &tp->t_items, li_trans) {
+		if (!test_bit(XFS_LI_DIRTY, &lip->li_flags))
+			continue;
+		if (lip->li_ops->iop_precommit) {
+			error = lip->li_ops->iop_precommit(tp, lip);
+			if (error)
+				break;
+		}
+	}
+	if (error)
+		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+	return error;
+}
+
 /*
  * Commit the given transaction to the log.
  *
@@ -869,6 +953,13 @@ __xfs_trans_commit(
 
 	trace_xfs_trans_commit(tp, _RET_IP_);
 
+	error = xfs_trans_run_precommits(tp);
+	if (error) {
+		if (tp->t_flags & XFS_TRANS_PERM_LOG_RES)
+			xfs_defer_cancel(tp);
+		goto out_unreserve;
+	}
+
 	/*
 	 * Finish deferred items on final commit. Only permanent transactions
 	 * should ever have deferred ops.
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 9561f193e7e1..55819785941c 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -45,6 +45,7 @@ struct xfs_log_item {
 	struct xfs_log_vec		*li_lv;		/* active log vector */
 	struct xfs_log_vec		*li_lv_shadow;	/* standby vector */
 	xfs_csn_t			li_seq;		/* CIL commit seq */
+	uint32_t			li_order_id;	/* CIL commit order */
 };
 
 /*
@@ -71,10 +72,12 @@ struct xfs_item_ops {
 	void (*iop_format)(struct xfs_log_item *, struct xfs_log_vec *);
 	void (*iop_pin)(struct xfs_log_item *);
 	void (*iop_unpin)(struct xfs_log_item *, int remove);
-	uint (*iop_push)(struct xfs_log_item *, struct list_head *);
+	uint64_t (*iop_sort)(struct xfs_log_item *lip);
+	int (*iop_precommit)(struct xfs_trans *tp, struct xfs_log_item *lip);
 	void (*iop_committing)(struct xfs_log_item *lip, xfs_csn_t seq);
-	void (*iop_release)(struct xfs_log_item *);
 	xfs_lsn_t (*iop_committed)(struct xfs_log_item *, xfs_lsn_t);
+	uint (*iop_push)(struct xfs_log_item *, struct list_head *);
+	void (*iop_release)(struct xfs_log_item *);
 	int (*iop_recover)(struct xfs_log_item *lip,
 			   struct list_head *capture_list);
 	bool (*iop_match)(struct xfs_log_item *item, uint64_t id);
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index f0d79a9050ba..d5400150358e 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -19,7 +19,8 @@ void	xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *);
 void	xfs_trans_del_item(struct xfs_log_item *);
 void	xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp);
 
-void	xfs_trans_committed_bulk(struct xfs_ail *ailp, struct xfs_log_vec *lv,
+void	xfs_trans_committed_bulk(struct xfs_ail *ailp,
+				struct list_head *lv_chain,
 				xfs_lsn_t commit_lsn, bool aborted);
 /*
  * AIL traversal cursor.
diff --git a/include/crypto/hash.h b/include/crypto/hash.h
index f140e4643949..f5841992dc9b 100644
--- a/include/crypto/hash.h
+++ b/include/crypto/hash.h
@@ -718,6 +718,8 @@ static inline void ahash_request_set_crypt(struct ahash_request *req,
 struct crypto_shash *crypto_alloc_shash(const char *alg_name, u32 type,
 					u32 mask);
 
+int crypto_has_shash(const char *alg_name, u32 type, u32 mask);
+
 static inline struct crypto_tfm *crypto_shash_tfm(struct crypto_shash *tfm)
 {
 	return &tfm->base;
diff --git a/include/crypto/kpp.h b/include/crypto/kpp.h
index cccceadc164b..24d01e9877c1 100644
--- a/include/crypto/kpp.h
+++ b/include/crypto/kpp.h
@@ -104,6 +104,8 @@ struct kpp_alg {
  */
 struct crypto_kpp *crypto_alloc_kpp(const char *alg_name, u32 type, u32 mask);
 
+int crypto_has_kpp(const char *alg_name, u32 type, u32 mask);
+
 static inline struct crypto_tfm *crypto_kpp_tfm(struct crypto_kpp *tfm)
 {
 	return &tfm->base;
diff --git a/include/linux/base64.h b/include/linux/base64.h
new file mode 100644
index 000000000000..660d4cb1ef31
--- /dev/null
+++ b/include/linux/base64.h
@@ -0,0 +1,16 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * base64 encoding, lifted from fs/crypto/fname.c.
+ */
+
+#ifndef _LINUX_BASE64_H
+#define _LINUX_BASE64_H
+
+#include <linux/types.h>
+
+#define BASE64_CHARS(nbytes)   DIV_ROUND_UP((nbytes) * 4, 3)
+
+int base64_encode(const u8 *src, int len, char *dst);
+int base64_decode(const char *src, int len, u8 *dst);
+
+#endif /* _LINUX_BASE64_H */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index dccdf1551c62..84b13fdd34a7 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -140,6 +140,8 @@ struct gendisk {
 	struct request_queue *queue;
 	void *private_data;
 
+	struct bio_set bio_split;
+
 	int flags;
 	unsigned long state;
 #define GD_NEED_PART_SCAN		0
@@ -531,7 +533,6 @@ struct request_queue {
 
 	struct blk_mq_tag_set	*tag_set;
 	struct list_head	tag_set_list;
-	struct bio_set		bio_split;
 
 	struct dentry		*debugfs_dir;
 	struct dentry		*sched_debugfs_dir;
@@ -864,9 +865,9 @@ void blk_request_module(dev_t devt);
 extern int blk_register_queue(struct gendisk *disk);
 extern void blk_unregister_queue(struct gendisk *disk);
 void submit_bio_noacct(struct bio *bio);
+struct bio *bio_split_to_limits(struct bio *bio);
 
 extern int blk_lld_busy(struct request_queue *q);
-extern void blk_queue_split(struct bio **);
 extern int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags);
 extern void blk_queue_exit(struct request_queue *q);
 extern void blk_sync_queue(struct request_queue *q);
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index dc1724131300..0b7242370b56 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -54,14 +54,13 @@
  * CONFIG_JBD2_DEBUG is on.
  */
 #define JBD2_EXPENSIVE_CHECKING
-extern ushort jbd2_journal_enable_debug;
 void __jbd2_debug(int level, const char *file, const char *func,
 		  unsigned int line, const char *fmt, ...);
 
-#define jbd_debug(n, fmt, a...) \
+#define jbd2_debug(n, fmt, a...) \
 	__jbd2_debug((n), __FILE__, __func__, __LINE__, (fmt), ##a)
 #else
-#define jbd_debug(n, fmt, a...)  no_printk(fmt, ##a)
+#define jbd2_debug(n, fmt, a...)  no_printk(fmt, ##a)
 #endif
 
 extern void *jbd2_alloc(size_t size, gfp_t flags);
@@ -1647,7 +1646,6 @@ extern void	jbd2_clear_buffer_revoked_flags(journal_t *journal);
  */
 
 int jbd2_log_start_commit(journal_t *journal, tid_t tid);
-int __jbd2_log_start_commit(journal_t *journal, tid_t tid);
 int jbd2_journal_start_commit(journal_t *journal, tid_t *tid);
 int jbd2_log_wait_commit(journal_t *journal, tid_t tid);
 int jbd2_transaction_committed(journal_t *journal, tid_t tid);
diff --git a/include/linux/mbcache.h b/include/linux/mbcache.h
index 20f1e3ff6013..2da63fd7b98f 100644
--- a/include/linux/mbcache.h
+++ b/include/linux/mbcache.h
@@ -13,8 +13,16 @@ struct mb_cache;
 struct mb_cache_entry {
 	/* List of entries in cache - protected by cache->c_list_lock */
 	struct list_head	e_list;
-	/* Hash table list - protected by hash chain bitlock */
+	/*
+	 * Hash table list - protected by hash chain bitlock. The entry is
+	 * guaranteed to be hashed while e_refcnt > 0.
+	 */
 	struct hlist_bl_node	e_hash_list;
+	/*
+	 * Entry refcount. Once it reaches zero, entry is unhashed and freed.
+	 * While refcount > 0, the entry is guaranteed to stay in the hash and
+	 * e.g. mb_cache_entry_try_delete() will fail.
+	 */
 	atomic_t		e_refcnt;
 	/* Key in hash - stable during lifetime of the entry */
 	u32			e_key;
@@ -29,17 +37,24 @@ void mb_cache_destroy(struct mb_cache *cache);
 
 int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
 			  u64 value, bool reusable);
-void __mb_cache_entry_free(struct mb_cache_entry *entry);
-static inline int mb_cache_entry_put(struct mb_cache *cache,
-				     struct mb_cache_entry *entry)
+void __mb_cache_entry_free(struct mb_cache *cache,
+			   struct mb_cache_entry *entry);
+void mb_cache_entry_wait_unused(struct mb_cache_entry *entry);
+static inline void mb_cache_entry_put(struct mb_cache *cache,
+				      struct mb_cache_entry *entry)
 {
-	if (!atomic_dec_and_test(&entry->e_refcnt))
-		return 0;
-	__mb_cache_entry_free(entry);
-	return 1;
+	unsigned int cnt = atomic_dec_return(&entry->e_refcnt);
+
+	if (cnt > 0) {
+		if (cnt <= 2)
+			wake_up_var(&entry->e_refcnt);
+		return;
+	}
+	__mb_cache_entry_free(cache, entry);
 }
 
-void mb_cache_entry_delete(struct mb_cache *cache, u32 key, u64 value);
+struct mb_cache_entry *mb_cache_entry_delete_or_get(struct mb_cache *cache,
+						    u32 key, u64 value);
 struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
 					  u64 value);
 struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache,
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index ecda6e63d5f2..96b16fbe1aa4 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -733,10 +733,10 @@ enum {
 };
 
 enum {
-	MR_CACHE_LAST_STD_ENTRY = 20,
+	MKEY_CACHE_LAST_STD_ENTRY = 20,
 	MLX5_IMR_MTT_CACHE_ENTRY,
 	MLX5_IMR_KSM_CACHE_ENTRY,
-	MAX_MR_CACHE_ENTRIES
+	MAX_MKEY_CACHE_ENTRIES
 };
 
 struct mlx5_profile {
@@ -745,7 +745,7 @@ struct mlx5_profile {
 	struct {
 		int	size;
 		int	limit;
-	} mr_cache[MAX_MR_CACHE_ENTRIES];
+	} mr_cache[MAX_MKEY_CACHE_ENTRIES];
 };
 
 struct mlx5_hca_cap {
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index ece3e35622d7..8e73c377da2c 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -178,6 +178,7 @@ struct mlx5_flow_table_attr {
 	int max_fte;
 	u32 level;
 	u32 flags;
+	u16 uid;
 	struct mlx5_flow_table *next_ft;
 
 	struct {
@@ -315,4 +316,5 @@ struct mlx5_pkt_reformat *mlx5_packet_reformat_alloc(struct mlx5_core_dev *dev,
 void mlx5_packet_reformat_dealloc(struct mlx5_core_dev *dev,
 				  struct mlx5_pkt_reformat *reformat);
 
+u32 mlx5_flow_table_id(struct mlx5_flow_table *ft);
 #endif
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 51b4e71017ee..4acd5610e96b 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1371,7 +1371,9 @@ enum {
 };
 
 struct mlx5_ifc_cmd_hca_cap_bits {
-	u8         reserved_at_0[0x1f];
+	u8         reserved_at_0[0x10];
+	u8         shared_object_to_user_object_allowed[0x1];
+	u8         reserved_at_13[0xe];
 	u8         vhca_resource_manager[0x1];
 
 	u8         hca_cap_2[0x1];
@@ -8528,7 +8530,7 @@ struct mlx5_ifc_create_flow_table_out_bits {
 
 struct mlx5_ifc_create_flow_table_in_bits {
 	u8         opcode[0x10];
-	u8         reserved_at_10[0x10];
+	u8         uid[0x10];
 
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
diff --git a/include/linux/nvme-auth.h b/include/linux/nvme-auth.h
new file mode 100644
index 000000000000..dcb8030062dd
--- /dev/null
+++ b/include/linux/nvme-auth.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2021 Hannes Reinecke, SUSE Software Solutions
+ */
+
+#ifndef _NVME_AUTH_H
+#define _NVME_AUTH_H
+
+#include <crypto/kpp.h>
+
+struct nvme_dhchap_key {
+	u8 *key;
+	size_t len;
+	u8 hash;
+};
+
+u32 nvme_auth_get_seqnum(void);
+const char *nvme_auth_dhgroup_name(u8 dhgroup_id);
+const char *nvme_auth_dhgroup_kpp(u8 dhgroup_id);
+u8 nvme_auth_dhgroup_id(const char *dhgroup_name);
+
+const char *nvme_auth_hmac_name(u8 hmac_id);
+const char *nvme_auth_digest_name(u8 hmac_id);
+size_t nvme_auth_hmac_hash_len(u8 hmac_id);
+u8 nvme_auth_hmac_id(const char *hmac_name);
+
+struct nvme_dhchap_key *nvme_auth_extract_key(unsigned char *secret,
+					      u8 key_hash);
+void nvme_auth_free_key(struct nvme_dhchap_key *key);
+u8 *nvme_auth_transform_key(struct nvme_dhchap_key *key, char *nqn);
+int nvme_auth_generate_key(u8 *secret, struct nvme_dhchap_key **ret_key);
+int nvme_auth_augmented_challenge(u8 hmac_id, u8 *skey, size_t skey_len,
+				  u8 *challenge, u8 *aug, size_t hlen);
+int nvme_auth_gen_privkey(struct crypto_kpp *dh_tfm, u8 dh_gid);
+int nvme_auth_gen_pubkey(struct crypto_kpp *dh_tfm,
+			 u8 *host_key, size_t host_key_len);
+int nvme_auth_gen_shared_secret(struct crypto_kpp *dh_tfm,
+				u8 *ctrl_key, size_t ctrl_key_len,
+				u8 *sess_key, size_t sess_key_len);
+
+#endif /* _NVME_AUTH_H */
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 07cfc922f8e4..ae53d74f3696 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -19,6 +19,7 @@
 #define NVMF_TRSVCID_SIZE	32
 #define NVMF_TRADDR_SIZE	256
 #define NVMF_TSAS_SIZE		256
+#define NVMF_AUTH_HASH_LEN	64
 
 #define NVME_DISC_SUBSYS_NAME	"nqn.2014-08.org.nvmexpress.discovery"
 
@@ -711,6 +712,10 @@ enum {
 	NVME_AER_VS			= 7,
 };
 
+enum {
+	NVME_AER_ERROR_PERSIST_INT_ERR	= 0x03,
+};
+
 enum {
 	NVME_AER_NOTICE_NS_CHANGED	= 0x00,
 	NVME_AER_NOTICE_FW_ACT_STARTING = 0x01,
@@ -1369,6 +1374,8 @@ enum nvmf_capsule_command {
 	nvme_fabrics_type_property_set	= 0x00,
 	nvme_fabrics_type_connect	= 0x01,
 	nvme_fabrics_type_property_get	= 0x04,
+	nvme_fabrics_type_auth_send	= 0x05,
+	nvme_fabrics_type_auth_receive	= 0x06,
 };
 
 #define nvme_fabrics_type_name(type)   { type, #type }
@@ -1376,7 +1383,9 @@ enum nvmf_capsule_command {
 	__print_symbolic(type,						\
 		nvme_fabrics_type_name(nvme_fabrics_type_property_set),	\
 		nvme_fabrics_type_name(nvme_fabrics_type_connect),	\
-		nvme_fabrics_type_name(nvme_fabrics_type_property_get))
+		nvme_fabrics_type_name(nvme_fabrics_type_property_get), \
+		nvme_fabrics_type_name(nvme_fabrics_type_auth_send),	\
+		nvme_fabrics_type_name(nvme_fabrics_type_auth_receive))
 
 /*
  * If not fabrics command, fctype will be ignored.
@@ -1472,6 +1481,11 @@ struct nvmf_connect_command {
 	__u8		resv4[12];
 };
 
+enum {
+	NVME_CONNECT_AUTHREQ_ASCR	= (1 << 2),
+	NVME_CONNECT_AUTHREQ_ATR	= (1 << 1),
+};
+
 struct nvmf_connect_data {
 	uuid_t		hostid;
 	__le16		cntlid;
@@ -1506,6 +1520,200 @@ struct nvmf_property_get_command {
 	__u8		resv4[16];
 };
 
+struct nvmf_auth_common_command {
+	__u8		opcode;
+	__u8		resv1;
+	__u16		command_id;
+	__u8		fctype;
+	__u8		resv2[19];
+	union nvme_data_ptr dptr;
+	__u8		resv3;
+	__u8		spsp0;
+	__u8		spsp1;
+	__u8		secp;
+	__le32		al_tl;
+	__u8		resv4[16];
+};
+
+struct nvmf_auth_send_command {
+	__u8		opcode;
+	__u8		resv1;
+	__u16		command_id;
+	__u8		fctype;
+	__u8		resv2[19];
+	union nvme_data_ptr dptr;
+	__u8		resv3;
+	__u8		spsp0;
+	__u8		spsp1;
+	__u8		secp;
+	__le32		tl;
+	__u8		resv4[16];
+};
+
+struct nvmf_auth_receive_command {
+	__u8		opcode;
+	__u8		resv1;
+	__u16		command_id;
+	__u8		fctype;
+	__u8		resv2[19];
+	union nvme_data_ptr dptr;
+	__u8		resv3;
+	__u8		spsp0;
+	__u8		spsp1;
+	__u8		secp;
+	__le32		al;
+	__u8		resv4[16];
+};
+
+/* Value for secp */
+enum {
+	NVME_AUTH_DHCHAP_PROTOCOL_IDENTIFIER	= 0xe9,
+};
+
+/* Defined value for auth_type */
+enum {
+	NVME_AUTH_COMMON_MESSAGES	= 0x00,
+	NVME_AUTH_DHCHAP_MESSAGES	= 0x01,
+};
+
+/* Defined messages for auth_id */
+enum {
+	NVME_AUTH_DHCHAP_MESSAGE_NEGOTIATE	= 0x00,
+	NVME_AUTH_DHCHAP_MESSAGE_CHALLENGE	= 0x01,
+	NVME_AUTH_DHCHAP_MESSAGE_REPLY		= 0x02,
+	NVME_AUTH_DHCHAP_MESSAGE_SUCCESS1	= 0x03,
+	NVME_AUTH_DHCHAP_MESSAGE_SUCCESS2	= 0x04,
+	NVME_AUTH_DHCHAP_MESSAGE_FAILURE2	= 0xf0,
+	NVME_AUTH_DHCHAP_MESSAGE_FAILURE1	= 0xf1,
+};
+
+struct nvmf_auth_dhchap_protocol_descriptor {
+	__u8		authid;
+	__u8		rsvd;
+	__u8		halen;
+	__u8		dhlen;
+	__u8		idlist[60];
+};
+
+enum {
+	NVME_AUTH_DHCHAP_AUTH_ID	= 0x01,
+};
+
+/* Defined hash functions for DH-HMAC-CHAP authentication */
+enum {
+	NVME_AUTH_HASH_SHA256	= 0x01,
+	NVME_AUTH_HASH_SHA384	= 0x02,
+	NVME_AUTH_HASH_SHA512	= 0x03,
+	NVME_AUTH_HASH_INVALID	= 0xff,
+};
+
+/* Defined Diffie-Hellman group identifiers for DH-HMAC-CHAP authentication */
+enum {
+	NVME_AUTH_DHGROUP_NULL		= 0x00,
+	NVME_AUTH_DHGROUP_2048		= 0x01,
+	NVME_AUTH_DHGROUP_3072		= 0x02,
+	NVME_AUTH_DHGROUP_4096		= 0x03,
+	NVME_AUTH_DHGROUP_6144		= 0x04,
+	NVME_AUTH_DHGROUP_8192		= 0x05,
+	NVME_AUTH_DHGROUP_INVALID	= 0xff,
+};
+
+union nvmf_auth_protocol {
+	struct nvmf_auth_dhchap_protocol_descriptor dhchap;
+};
+
+struct nvmf_auth_dhchap_negotiate_data {
+	__u8		auth_type;
+	__u8		auth_id;
+	__le16		rsvd;
+	__le16		t_id;
+	__u8		sc_c;
+	__u8		napd;
+	union nvmf_auth_protocol auth_protocol[];
+};
+
+struct nvmf_auth_dhchap_challenge_data {
+	__u8		auth_type;
+	__u8		auth_id;
+	__u16		rsvd1;
+	__le16		t_id;
+	__u8		hl;
+	__u8		rsvd2;
+	__u8		hashid;
+	__u8		dhgid;
+	__le16		dhvlen;
+	__le32		seqnum;
+	/* 'hl' bytes of challenge value */
+	__u8		cval[];
+	/* followed by 'dhvlen' bytes of DH value */
+};
+
+struct nvmf_auth_dhchap_reply_data {
+	__u8		auth_type;
+	__u8		auth_id;
+	__le16		rsvd1;
+	__le16		t_id;
+	__u8		hl;
+	__u8		rsvd2;
+	__u8		cvalid;
+	__u8		rsvd3;
+	__le16		dhvlen;
+	__le32		seqnum;
+	/* 'hl' bytes of response data */
+	__u8		rval[];
+	/* followed by 'hl' bytes of Challenge value */
+	/* followed by 'dhvlen' bytes of DH value */
+};
+
+enum {
+	NVME_AUTH_DHCHAP_RESPONSE_VALID	= (1 << 0),
+};
+
+struct nvmf_auth_dhchap_success1_data {
+	__u8		auth_type;
+	__u8		auth_id;
+	__le16		rsvd1;
+	__le16		t_id;
+	__u8		hl;
+	__u8		rsvd2;
+	__u8		rvalid;
+	__u8		rsvd3[7];
+	/* 'hl' bytes of response value if 'rvalid' is set */
+	__u8		rval[];
+};
+
+struct nvmf_auth_dhchap_success2_data {
+	__u8		auth_type;
+	__u8		auth_id;
+	__le16		rsvd1;
+	__le16		t_id;
+	__u8		rsvd2[10];
+};
+
+struct nvmf_auth_dhchap_failure_data {
+	__u8		auth_type;
+	__u8		auth_id;
+	__le16		rsvd1;
+	__le16		t_id;
+	__u8		rescode;
+	__u8		rescode_exp;
+};
+
+enum {
+	NVME_AUTH_DHCHAP_FAILURE_REASON_FAILED	= 0x01,
+};
+
+enum {
+	NVME_AUTH_DHCHAP_FAILURE_FAILED			= 0x01,
+	NVME_AUTH_DHCHAP_FAILURE_NOT_USABLE		= 0x02,
+	NVME_AUTH_DHCHAP_FAILURE_CONCAT_MISMATCH	= 0x03,
+	NVME_AUTH_DHCHAP_FAILURE_HASH_UNUSABLE		= 0x04,
+	NVME_AUTH_DHCHAP_FAILURE_DHGROUP_UNUSABLE	= 0x05,
+	NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD	= 0x06,
+	NVME_AUTH_DHCHAP_FAILURE_INCORRECT_MESSAGE	= 0x07,
+};
+
+
 struct nvme_dbbuf {
 	__u8			opcode;
 	__u8			flags;
@@ -1549,6 +1757,9 @@ struct nvme_command {
 		struct nvmf_connect_command connect;
 		struct nvmf_property_set_command prop_set;
 		struct nvmf_property_get_command prop_get;
+		struct nvmf_auth_common_command auth_common;
+		struct nvmf_auth_send_command auth_send;
+		struct nvmf_auth_receive_command auth_receive;
 		struct nvme_dbbuf dbbuf;
 		struct nvme_directive_cmd directive;
 	};
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 9c6317cf80d5..7c2f76f34f6f 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -4603,7 +4603,7 @@ static inline enum rdma_ah_attr_type rdma_ah_find_type(struct ib_device *dev,
 
 /**
  * ib_lid_cpu16 - Return lid in 16bit CPU encoding.
- *     In the current implementation the only way to get
+ *     In the current implementation the only way to
  *     get the 32bit lid is from other sources for OPA.
  *     For IB, lids will always be 16bits so cast the
  *     value accordingly.
diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h
index d989f030fae0..5b18e2e36ee6 100644
--- a/include/rdma/rdma_cm.h
+++ b/include/rdma/rdma_cm.h
@@ -108,6 +108,7 @@ struct rdma_cm_id {
 	enum rdma_ucm_port_space ps;
 	enum ib_qp_type		 qp_type;
 	u32			 port_num;
+	struct work_struct net_work;
 };
 
 struct rdma_cm_id *
diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
index ca33092354ab..677edaab2b66 100644
--- a/include/uapi/linux/ublk_cmd.h
+++ b/include/uapi/linux/ublk_cmd.h
@@ -15,6 +15,8 @@
 #define	UBLK_CMD_DEL_DEV		0x05
 #define	UBLK_CMD_START_DEV	0x06
 #define	UBLK_CMD_STOP_DEV	0x07
+#define	UBLK_CMD_SET_PARAMS	0x08
+#define	UBLK_CMD_GET_PARAMS	0x09
 
 /*
  * IO commands, issued by ublk server, and handled by ublk driver.
@@ -28,12 +30,21 @@
  *      this IO request, request's handling result is committed to ublk
  *      driver, meantime FETCH_REQ is piggyback, and FETCH_REQ has to be
  *      handled before completing io request.
+ *
+ * NEED_GET_DATA: only used for write requests to set io addr and copy data
+ *      When NEED_GET_DATA is set, ublksrv has to issue UBLK_IO_NEED_GET_DATA
+ *      command after ublk driver returns UBLK_IO_RES_NEED_GET_DATA.
+ *
+ *      It is only used if ublksrv set UBLK_F_NEED_GET_DATA flag
+ *      while starting a ublk device.
  */
 #define	UBLK_IO_FETCH_REQ		0x20
 #define	UBLK_IO_COMMIT_AND_FETCH_REQ	0x21
+#define	UBLK_IO_NEED_GET_DATA	0x22
 
 /* only ABORT means that no re-fetch */
 #define UBLK_IO_RES_OK			0
+#define UBLK_IO_RES_NEED_GET_DATA	1
 #define UBLK_IO_RES_ABORT		(-ENODEV)
 
 #define UBLKSRV_CMD_BUF_OFFSET	0
@@ -54,6 +65,15 @@
  */
 #define UBLK_F_URING_CMD_COMP_IN_TASK	(1ULL << 1)
 
+/*
+ * User should issue io cmd again for write requests to
+ * set io buffer address and copy data from bio vectors
+ * to the userspace io buffer.
+ *
+ * In this mode, task_work is not used.
+ */
+#define UBLK_F_NEED_GET_DATA (1UL << 2)
+
 /* device state */
 #define UBLK_S_DEV_DEAD	0
 #define UBLK_S_DEV_LIVE	1
@@ -78,22 +98,23 @@ struct ublksrv_ctrl_cmd {
 struct ublksrv_ctrl_dev_info {
 	__u16	nr_hw_queues;
 	__u16	queue_depth;
-	__u16	block_size;
 	__u16	state;
+	__u16	pad0;
 
-	__u32	rq_max_blocks;
+	__u32	max_io_buf_bytes;
 	__u32	dev_id;
 
-	__u64   dev_blocks;
-
 	__s32	ublksrv_pid;
-	__s32	reserved0;
+	__u32	pad1;
+
 	__u64	flags;
-	__u64	flags_reserved;
 
 	/* For ublksrv internal use, invisible to ublk driver */
 	__u64	ublksrv_flags;
-	__u64	reserved1[9];
+
+	__u64	reserved0;
+	__u64	reserved1;
+	__u64   reserved2;
 };
 
 #define		UBLK_IO_OP_READ		0
@@ -158,4 +179,49 @@ struct ublksrv_io_cmd {
 	__u64	addr;
 };
 
+struct ublk_param_basic {
+#define UBLK_ATTR_READ_ONLY            (1 << 0)
+#define UBLK_ATTR_ROTATIONAL           (1 << 1)
+#define UBLK_ATTR_VOLATILE_CACHE       (1 << 2)
+#define UBLK_ATTR_FUA                  (1 << 3)
+	__u32	attrs;
+	__u8	logical_bs_shift;
+	__u8	physical_bs_shift;
+	__u8	io_opt_shift;
+	__u8	io_min_shift;
+
+	__u32	max_sectors;
+	__u32	chunk_sectors;
+
+	__u64   dev_sectors;
+	__u64   virt_boundary_mask;
+};
+
+struct ublk_param_discard {
+	__u32	discard_alignment;
+
+	__u32	discard_granularity;
+	__u32	max_discard_sectors;
+
+	__u32	max_write_zeroes_sectors;
+	__u16	max_discard_segments;
+	__u16	reserved0;
+};
+
+struct ublk_params {
+	/*
+	 * Total length of parameters, userspace has to set 'len' for both
+	 * SET_PARAMS and GET_PARAMS command, and driver may update len
+	 * if two sides use different version of 'ublk_params', same with
+	 * 'types' fields.
+	 */
+	__u32	len;
+#define UBLK_PARAM_TYPE_BASIC           (1 << 0)
+#define UBLK_PARAM_TYPE_DISCARD         (1 << 1)
+	__u32	types;			/* types of parameter included */
+
+	struct ublk_param_basic		basic;
+	struct ublk_param_discard	discard;
+};
+
 #endif
diff --git a/include/uapi/rdma/erdma-abi.h b/include/uapi/rdma/erdma-abi.h
new file mode 100644
index 000000000000..b7a0222f978f
--- /dev/null
+++ b/include/uapi/rdma/erdma-abi.h
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */
+/*
+ * Copyright (c) 2020-2022, Alibaba Group.
+ */
+
+#ifndef __ERDMA_USER_H__
+#define __ERDMA_USER_H__
+
+#include <linux/types.h>
+
+#define ERDMA_ABI_VERSION       1
+
+struct erdma_ureq_create_cq {
+	__aligned_u64 db_record_va;
+	__aligned_u64 qbuf_va;
+	__u32 qbuf_len;
+	__u32 rsvd0;
+};
+
+struct erdma_uresp_create_cq {
+	__u32 cq_id;
+	__u32 num_cqe;
+};
+
+struct erdma_ureq_create_qp {
+	__aligned_u64 db_record_va;
+	__aligned_u64 qbuf_va;
+	__u32 qbuf_len;
+	__u32 rsvd0;
+};
+
+struct erdma_uresp_create_qp {
+	__u32 qp_id;
+	__u32 num_sqe;
+	__u32 num_rqe;
+	__u32 rq_offset;
+};
+
+struct erdma_uresp_alloc_ctx {
+	__u32 dev_id;
+	__u32 pad;
+	__u32 sdb_type;
+	__u32 sdb_offset;
+	__aligned_u64 sdb;
+	__aligned_u64 rdb;
+	__aligned_u64 cdb;
+};
+
+#endif
diff --git a/include/uapi/rdma/ib_user_ioctl_verbs.h b/include/uapi/rdma/ib_user_ioctl_verbs.h
index 3072e5d6b692..7dd56210226f 100644
--- a/include/uapi/rdma/ib_user_ioctl_verbs.h
+++ b/include/uapi/rdma/ib_user_ioctl_verbs.h
@@ -250,6 +250,7 @@ enum rdma_driver_id {
 	RDMA_DRIVER_QIB,
 	RDMA_DRIVER_EFA,
 	RDMA_DRIVER_SIW,
+	RDMA_DRIVER_ERDMA,
 };
 
 enum ib_uverbs_gid_type {
diff --git a/include/uapi/rdma/mlx5_user_ioctl_cmds.h b/include/uapi/rdma/mlx5_user_ioctl_cmds.h
index e539c84d63f1..3bee490eb585 100644
--- a/include/uapi/rdma/mlx5_user_ioctl_cmds.h
+++ b/include/uapi/rdma/mlx5_user_ioctl_cmds.h
@@ -228,6 +228,7 @@ enum mlx5_ib_objects {
 	MLX5_IB_OBJECT_VAR,
 	MLX5_IB_OBJECT_PP,
 	MLX5_IB_OBJECT_UAR,
+	MLX5_IB_OBJECT_STEERING_ANCHOR,
 };
 
 enum mlx5_ib_flow_matcher_create_attrs {
@@ -248,6 +249,22 @@ enum mlx5_ib_flow_matcher_methods {
 	MLX5_IB_METHOD_FLOW_MATCHER_DESTROY,
 };
 
+enum mlx5_ib_flow_steering_anchor_create_attrs {
+	MLX5_IB_ATTR_STEERING_ANCHOR_CREATE_HANDLE = (1U << UVERBS_ID_NS_SHIFT),
+	MLX5_IB_ATTR_STEERING_ANCHOR_FT_TYPE,
+	MLX5_IB_ATTR_STEERING_ANCHOR_PRIORITY,
+	MLX5_IB_ATTR_STEERING_ANCHOR_FT_ID,
+};
+
+enum mlx5_ib_flow_steering_anchor_destroy_attrs {
+	MLX5_IB_ATTR_STEERING_ANCHOR_DESTROY_HANDLE = (1U << UVERBS_ID_NS_SHIFT),
+};
+
+enum mlx5_ib_steering_anchor_methods {
+	MLX5_IB_METHOD_STEERING_ANCHOR_CREATE = (1U << UVERBS_ID_NS_SHIFT),
+	MLX5_IB_METHOD_STEERING_ANCHOR_DESTROY,
+};
+
 enum mlx5_ib_device_query_context_attrs {
 	MLX5_IB_ATTR_QUERY_CONTEXT_RESP_UCTX = (1U << UVERBS_ID_NS_SHIFT),
 };
diff --git a/lib/Makefile b/lib/Makefile
index 67482f5ec0e8..441795bf5d33 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -46,7 +46,7 @@ obj-y += bcd.o sort.o parser.o debug_locks.o random32.o \
 	 bust_spinlocks.o kasprintf.o bitmap.o scatterlist.o \
 	 list_sort.o uuid.o iov_iter.o clz_ctz.o \
 	 bsearch.o find_bit.o llist.o memweight.o kfifo.o \
-	 percpu-refcount.o rhashtable.o \
+	 percpu-refcount.o rhashtable.o base64.o \
 	 once.o refcount.o usercopy.o errseq.o bucket_locks.o \
 	 generic-radix-tree.o
 obj-$(CONFIG_STRING_SELFTEST) += test_string.o
diff --git a/lib/base64.c b/lib/base64.c
new file mode 100644
index 000000000000..b736a7a431c5
--- /dev/null
+++ b/lib/base64.c
@@ -0,0 +1,103 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * base64.c - RFC4648-compliant base64 encoding
+ *
+ * Copyright (c) 2020 Hannes Reinecke, SUSE
+ *
+ * Based on the base64url routines from fs/crypto/fname.c
+ * (which are using the URL-safe base64 encoding),
+ * modified to use the standard coding table from RFC4648 section 4.
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/export.h>
+#include <linux/string.h>
+#include <linux/base64.h>
+
+static const char base64_table[65] =
+	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+
+/**
+ * base64_encode() - base64-encode some binary data
+ * @src: the binary data to encode
+ * @srclen: the length of @src in bytes
+ * @dst: (output) the base64-encoded string.  Not NUL-terminated.
+ *
+ * Encodes data using base64 encoding, i.e. the "Base 64 Encoding" specified
+ * by RFC 4648, including the  '='-padding.
+ *
+ * Return: the length of the resulting base64-encoded string in bytes.
+ */
+int base64_encode(const u8 *src, int srclen, char *dst)
+{
+	u32 ac = 0;
+	int bits = 0;
+	int i;
+	char *cp = dst;
+
+	for (i = 0; i < srclen; i++) {
+		ac = (ac << 8) | src[i];
+		bits += 8;
+		do {
+			bits -= 6;
+			*cp++ = base64_table[(ac >> bits) & 0x3f];
+		} while (bits >= 6);
+	}
+	if (bits) {
+		*cp++ = base64_table[(ac << (6 - bits)) & 0x3f];
+		bits -= 6;
+	}
+	while (bits < 0) {
+		*cp++ = '=';
+		bits += 2;
+	}
+	return cp - dst;
+}
+EXPORT_SYMBOL_GPL(base64_encode);
+
+/**
+ * base64_decode() - base64-decode a string
+ * @src: the string to decode.  Doesn't need to be NUL-terminated.
+ * @srclen: the length of @src in bytes
+ * @dst: (output) the decoded binary data
+ *
+ * Decodes a string using base64 encoding, i.e. the "Base 64 Encoding"
+ * specified by RFC 4648, including the  '='-padding.
+ *
+ * This implementation hasn't been optimized for performance.
+ *
+ * Return: the length of the resulting decoded binary data in bytes,
+ *	   or -1 if the string isn't a valid base64 string.
+ */
+int base64_decode(const char *src, int srclen, u8 *dst)
+{
+	u32 ac = 0;
+	int bits = 0;
+	int i;
+	u8 *bp = dst;
+
+	for (i = 0; i < srclen; i++) {
+		const char *p = strchr(base64_table, src[i]);
+
+		if (src[i] == '=') {
+			ac = (ac << 6);
+			bits += 6;
+			if (bits >= 8)
+				bits -= 8;
+			continue;
+		}
+		if (p == NULL || src[i] == 0)
+			return -1;
+		ac = (ac << 6) | (p - base64_table);
+		bits += 6;
+		if (bits >= 8) {
+			bits -= 8;
+			*bp++ = (u8)(ac >> bits);
+		}
+	}
+	if (ac & ((1 << bits) - 1))
+		return -1;
+	return bp - dst;
+}
+EXPORT_SYMBOL_GPL(base64_decode);
diff --git a/tools/tracing/rtla/Makefile b/tools/tracing/rtla/Makefile
index 3822f4ea5f49..1bea2d16d4c1 100644
--- a/tools/tracing/rtla/Makefile
+++ b/tools/tracing/rtla/Makefile
@@ -1,6 +1,6 @@
 NAME	:=	rtla
 # Follow the kernel version
-VERSION :=	$(shell cat VERSION 2> /dev/null || make -sC ../../.. kernelversion)
+VERSION :=	$(shell cat VERSION 2> /dev/null || make -sC ../../.. kernelversion | grep -v make)
 
 # From libtracefs:
 # Makefiles suck: This macro sets a default value of $(2) for the
diff --git a/tools/tracing/rtla/src/trace.c b/tools/tracing/rtla/src/trace.c
index 5784c9f9e570..e1ba6d9f4265 100644
--- a/tools/tracing/rtla/src/trace.c
+++ b/tools/tracing/rtla/src/trace.c
@@ -134,13 +134,18 @@ void trace_instance_destroy(struct trace_instance *trace)
 	if (trace->inst) {
 		disable_tracer(trace->inst);
 		destroy_instance(trace->inst);
+		trace->inst = NULL;
 	}
 
-	if (trace->seq)
+	if (trace->seq) {
 		free(trace->seq);
+		trace->seq = NULL;
+	}
 
-	if (trace->tep)
+	if (trace->tep) {
 		tep_free(trace->tep);
+		trace->tep = NULL;
+	}
 }
 
 /*
diff --git a/tools/tracing/rtla/src/utils.c b/tools/tracing/rtla/src/utils.c
index 5352167a1e75..663a047f794d 100644
--- a/tools/tracing/rtla/src/utils.c
+++ b/tools/tracing/rtla/src/utils.c
@@ -106,8 +106,9 @@ int parse_cpu_list(char *cpu_list, char **monitored_cpus)
 
 	nr_cpus = sysconf(_SC_NPROCESSORS_CONF);
 
-	mon_cpus = malloc(nr_cpus * sizeof(char));
-	memset(mon_cpus, 0, (nr_cpus * sizeof(char)));
+	mon_cpus = calloc(nr_cpus, sizeof(char));
+	if (!mon_cpus)
+		goto err;
 
 	for (p = cpu_list; *p; ) {
 		cpu = atoi(p);
@@ -224,7 +225,7 @@ long parse_ns_duration(char *val)
 #elif __arm__
 # define __NR_sched_setattr	380
 # define __NR_sched_getattr	381
-#elif __aarch64__
+#elif __aarch64__ || __riscv
 # define __NR_sched_setattr	274
 # define __NR_sched_getattr	275
 #elif __powerpc__