mirror of
https://github.com/hardkernel/linux.git
synced 2026-06-06 02:50:49 +09:00
Merge 29b1d469f3 ("Merge tag 'trace-rtla-v5.20' of git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-trace") into android-mainline
Steps on the way to 6.0-rc1 Signed-off-by: Greg Kroah-Hartman <gregkh@google.com> Change-Id: Ia54fa6f7ff14e0c9cf434ff4b9c244b4cf2d9b6c
This commit is contained in:
@@ -72,6 +72,28 @@ submit_queues=[1..nr_cpus]: Default: 1
|
||||
hw_queue_depth=[0..qdepth]: Default: 64
|
||||
The hardware queue depth of the device.
|
||||
|
||||
memory_backed=[0/1]: Default: 0
|
||||
Whether or not to use a memory buffer to respond to IO requests
|
||||
|
||||
= =============================================
|
||||
0 Transfer no data in response to IO requests
|
||||
1 Use a memory buffer to respond to IO requests
|
||||
= =============================================
|
||||
|
||||
discard=[0/1]: Default: 0
|
||||
Support discard operations (requires memory-backed null_blk device).
|
||||
|
||||
= =====================================
|
||||
0 Do not support discard operations
|
||||
1 Enable support for discard operations
|
||||
= =====================================
|
||||
|
||||
cache_size=[Size in MB]: Default: 0
|
||||
Cache size in MB for memory-backed device.
|
||||
|
||||
mbps=[Maximum bandwidth in MB/s]: Default: 0 (no limit)
|
||||
Bandwidth limit for device performance.
|
||||
|
||||
Multi-queue specific parameters
|
||||
-------------------------------
|
||||
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
+---------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| i.i_block Offset | Where It Points |
|
||||
| i.i_block Offset | Where It Points |
|
||||
+=====================+==============================================================================================================================================================================================================================+
|
||||
| 0 to 11 | Direct map to file blocks 0 to 11. |
|
||||
+---------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
|
||||
@@ -1,29 +1,314 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
==========================
|
||||
XFS Delayed Logging Design
|
||||
==========================
|
||||
==================
|
||||
XFS Logging Design
|
||||
==================
|
||||
|
||||
Introduction to Re-logging in XFS
|
||||
=================================
|
||||
Preamble
|
||||
========
|
||||
|
||||
XFS logging is a combination of logical and physical logging. Some objects,
|
||||
such as inodes and dquots, are logged in logical format where the details
|
||||
logged are made up of the changes to in-core structures rather than on-disk
|
||||
structures. Other objects - typically buffers - have their physical changes
|
||||
logged. The reason for these differences is to reduce the amount of log space
|
||||
required for objects that are frequently logged. Some parts of inodes are more
|
||||
frequently logged than others, and inodes are typically more frequently logged
|
||||
than any other object (except maybe the superblock buffer) so keeping the
|
||||
amount of metadata logged low is of prime importance.
|
||||
This document describes the design and algorithms that the XFS journalling
|
||||
subsystem is based on. This document describes the design and algorithms that
|
||||
the XFS journalling subsystem is based on so that readers may familiarize
|
||||
themselves with the general concepts of how transaction processing in XFS works.
|
||||
|
||||
The reason that this is such a concern is that XFS allows multiple separate
|
||||
modifications to a single object to be carried in the log at any given time.
|
||||
This allows the log to avoid needing to flush each change to disk before
|
||||
recording a new change to the object. XFS does this via a method called
|
||||
"re-logging". Conceptually, this is quite simple - all it requires is that any
|
||||
new change to the object is recorded with a *new copy* of all the existing
|
||||
changes in the new transaction that is written to the log.
|
||||
We begin with an overview of transactions in XFS, followed by describing how
|
||||
transaction reservations are structured and accounted, and then move into how we
|
||||
guarantee forwards progress for long running transactions with finite initial
|
||||
reservation bounds. At this point we need to explain how relogging works. With
|
||||
the basic concepts covered, the design of the delayed logging mechanism is
|
||||
documented.
|
||||
|
||||
|
||||
Introduction
|
||||
============
|
||||
|
||||
XFS uses Write Ahead Logging for ensuring changes to the filesystem metadata
|
||||
are atomic and recoverable. For reasons of space and time efficiency, the
|
||||
logging mechanisms are varied and complex, combining intents, logical and
|
||||
physical logging mechanisms to provide the necessary recovery guarantees the
|
||||
filesystem requires.
|
||||
|
||||
Some objects, such as inodes and dquots, are logged in logical format where the
|
||||
details logged are made up of the changes to in-core structures rather than
|
||||
on-disk structures. Other objects - typically buffers - have their physical
|
||||
changes logged. Long running atomic modifications have individual changes
|
||||
chained together by intents, ensuring that journal recovery can restart and
|
||||
finish an operation that was only partially done when the system stopped
|
||||
functioning.
|
||||
|
||||
The reason for these differences is to keep the amount of log space and CPU time
|
||||
required to process objects being modified as small as possible and hence the
|
||||
logging overhead as low as possible. Some items are very frequently modified,
|
||||
and some parts of objects are more frequently modified than others, so keeping
|
||||
the overhead of metadata logging low is of prime importance.
|
||||
|
||||
The method used to log an item or chain modifications together isn't
|
||||
particularly important in the scope of this document. It suffices to know that
|
||||
the method used for logging a particular object or chaining modifications
|
||||
together are different and are dependent on the object and/or modification being
|
||||
performed. The logging subsystem only cares that certain specific rules are
|
||||
followed to guarantee forwards progress and prevent deadlocks.
|
||||
|
||||
|
||||
Transactions in XFS
|
||||
===================
|
||||
|
||||
XFS has two types of high level transactions, defined by the type of log space
|
||||
reservation they take. These are known as "one shot" and "permanent"
|
||||
transactions. Permanent transaction reservations can take reservations that span
|
||||
commit boundaries, whilst "one shot" transactions are for a single atomic
|
||||
modification.
|
||||
|
||||
The type and size of reservation must be matched to the modification taking
|
||||
place. This means that permanent transactions can be used for one-shot
|
||||
modifications, but one-shot reservations cannot be used for permanent
|
||||
transactions.
|
||||
|
||||
In the code, a one-shot transaction pattern looks somewhat like this::
|
||||
|
||||
tp = xfs_trans_alloc(<reservation>)
|
||||
<lock items>
|
||||
<join item to transaction>
|
||||
<do modification>
|
||||
xfs_trans_commit(tp);
|
||||
|
||||
As items are modified in the transaction, the dirty regions in those items are
|
||||
tracked via the transaction handle. Once the transaction is committed, all
|
||||
resources joined to it are released, along with the remaining unused reservation
|
||||
space that was taken at the transaction allocation time.
|
||||
|
||||
In contrast, a permanent transaction is made up of multiple linked individual
|
||||
transactions, and the pattern looks like this::
|
||||
|
||||
tp = xfs_trans_alloc(<reservation>)
|
||||
xfs_ilock(ip, XFS_ILOCK_EXCL)
|
||||
|
||||
loop {
|
||||
xfs_trans_ijoin(tp, 0);
|
||||
<do modification>
|
||||
xfs_trans_log_inode(tp, ip);
|
||||
xfs_trans_roll(&tp);
|
||||
}
|
||||
|
||||
xfs_trans_commit(tp);
|
||||
xfs_iunlock(ip, XFS_ILOCK_EXCL);
|
||||
|
||||
While this might look similar to a one-shot transaction, there is an important
|
||||
difference: xfs_trans_roll() performs a specific operation that links two
|
||||
transactions together::
|
||||
|
||||
ntp = xfs_trans_dup(tp);
|
||||
xfs_trans_commit(tp);
|
||||
xfs_log_reserve(ntp);
|
||||
|
||||
This results in a series of "rolling transactions" where the inode is locked
|
||||
across the entire chain of transactions. Hence while this series of rolling
|
||||
transactions is running, nothing else can read from or write to the inode and
|
||||
this provides a mechanism for complex changes to appear atomic from an external
|
||||
observer's point of view.
|
||||
|
||||
It is important to note that a series of rolling transactions in a permanent
|
||||
transaction does not form an atomic change in the journal. While each
|
||||
individual modification is atomic, the chain is *not atomic*. If we crash half
|
||||
way through, then recovery will only replay up to the last transactional
|
||||
modification the loop made that was committed to the journal.
|
||||
|
||||
This affects long running permanent transactions in that it is not possible to
|
||||
predict how much of a long running operation will actually be recovered because
|
||||
there is no guarantee of how much of the operation reached stable storage. Hence
|
||||
if a long running operation requires multiple transactions to fully complete,
|
||||
the high level operation must use intents and deferred operations to guarantee
|
||||
recovery can complete the operation once the first transaction is persisted in
|
||||
the on-disk journal.
|
||||
|
||||
|
||||
Transactions are Asynchronous
|
||||
=============================
|
||||
|
||||
In XFS, all high level transactions are asynchronous by default. This means that
|
||||
xfs_trans_commit() does not guarantee that the modification has been committed
|
||||
to stable storage when it returns. Hence when a system crashes, not all the
|
||||
completed transactions will be replayed during recovery.
|
||||
|
||||
However, the logging subsystem does provide global ordering guarantees, such
|
||||
that if a specific change is seen after recovery, all metadata modifications
|
||||
that were committed prior to that change will also be seen.
|
||||
|
||||
For single shot operations that need to reach stable storage immediately, or
|
||||
ensuring that a long running permanent transaction is fully committed once it is
|
||||
complete, we can explicitly tag a transaction as synchronous. This will trigger
|
||||
a "log force" to flush the outstanding committed transactions to stable storage
|
||||
in the journal and wait for that to complete.
|
||||
|
||||
Synchronous transactions are rarely used, however, because they limit logging
|
||||
throughput to the IO latency limitations of the underlying storage. Instead, we
|
||||
tend to use log forces to ensure modifications are on stable storage only when
|
||||
a user operation requires a synchronisation point to occur (e.g. fsync).
|
||||
|
||||
|
||||
Transaction Reservations
|
||||
========================
|
||||
|
||||
It has been mentioned a number of times now that the logging subsystem needs to
|
||||
provide a forwards progress guarantee so that no modification ever stalls
|
||||
because it can't be written to the journal due to a lack of space in the
|
||||
journal. This is achieved by the transaction reservations that are made when
|
||||
a transaction is first allocated. For permanent transactions, these reservations
|
||||
are maintained as part of the transaction rolling mechanism.
|
||||
|
||||
A transaction reservation provides a guarantee that there is physical log space
|
||||
available to write the modification into the journal before we start making
|
||||
modifications to objects and items. As such, the reservation needs to be large
|
||||
enough to take into account the amount of metadata that the change might need to
|
||||
log in the worst case. This means that if we are modifying a btree in the
|
||||
transaction, we have to reserve enough space to record a full leaf-to-root split
|
||||
of the btree. As such, the reservations are quite complex because we have to
|
||||
take into account all the hidden changes that might occur.
|
||||
|
||||
For example, a user data extent allocation involves allocating an extent from
|
||||
free space, which modifies the free space trees. That's two btrees. Inserting
|
||||
the extent into the inode's extent map might require a split of the extent map
|
||||
btree, which requires another allocation that can modify the free space trees
|
||||
again. Then we might have to update reverse mappings, which modifies yet
|
||||
another btree which might require more space. And so on. Hence the amount of
|
||||
metadata that a "simple" operation can modify can be quite large.
|
||||
|
||||
This "worst case" calculation provides us with the static "unit reservation"
|
||||
for the transaction that is calculated at mount time. We must guarantee that the
|
||||
log has this much space available before the transaction is allowed to proceed
|
||||
so that when we come to write the dirty metadata into the log we don't run out
|
||||
of log space half way through the write.
|
||||
|
||||
For one-shot transactions, a single unit space reservation is all that is
|
||||
required for the transaction to proceed. For permanent transactions, however, we
|
||||
also have a "log count" that affects the size of the reservation that is to be
|
||||
made.
|
||||
|
||||
While a permanent transaction can get by with a single unit of space
|
||||
reservation, it is somewhat inefficient to do this as it requires the
|
||||
transaction rolling mechanism to re-reserve space on every transaction roll. We
|
||||
know from the implementation of the permanent transactions how many transaction
|
||||
rolls are likely for the common modifications that need to be made.
|
||||
|
||||
For example, an inode allocation is typically two transactions - one to
|
||||
physically allocate a free inode chunk on disk, and another to allocate an inode
|
||||
from an inode chunk that has free inodes in it. Hence for an inode allocation
|
||||
transaction, we might set the reservation log count to a value of 2 to indicate
|
||||
that the common/fast path transaction will commit two linked transactions in a
|
||||
chain. Each time a permanent transaction rolls, it consumes an entire unit
|
||||
reservation.
|
||||
|
||||
Hence when the permanent transaction is first allocated, the log space
|
||||
reservation is increased from a single unit reservation to multiple unit
|
||||
reservations. That multiple is defined by the reservation log count, and this
|
||||
means we can roll the transaction multiple times before we have to re-reserve
|
||||
log space when we roll the transaction. This ensures that the common
|
||||
modifications we make only need to reserve log space once.
|
||||
|
||||
If the log count for a permanent transaction reaches zero, then it needs to
|
||||
re-reserve physical space in the log. This is somewhat complex, and requires
|
||||
an understanding of how the log accounts for space that has been reserved.
|
||||
|
||||
|
||||
Log Space Accounting
|
||||
====================
|
||||
|
||||
The position in the log is typically referred to as a Log Sequence Number (LSN).
|
||||
The log is circular, so the positions in the log are defined by the combination
|
||||
of a cycle number - the number of times the log has been overwritten - and the
|
||||
offset into the log. An LSN carries the cycle in the upper 32 bits and the
|
||||
offset in the lower 32 bits. The offset is in units of "basic blocks" (512
|
||||
bytes). Hence we can do relatively simple LSN based math to keep track of
|
||||
available space in the log.
|
||||
|
||||
Log space accounting is done via a pair of constructs called "grant heads". The
|
||||
position of the grant heads is an absolute value, so the amount of space
|
||||
available in the log is defined by the distance between the position of the
|
||||
grant head and the current log tail. That is, how much space can be
|
||||
reserved/consumed before the grant heads would fully wrap the log and overtake
|
||||
the tail position.
|
||||
|
||||
The first grant head is the "reserve" head. This tracks the byte count of the
|
||||
reservations currently held by active transactions. It is a purely in-memory
|
||||
accounting of the space reservation and, as such, actually tracks byte offsets
|
||||
into the log rather than basic blocks. Hence it technically isn't using LSNs to
|
||||
represent the log position, but it is still treated like a split {cycle,offset}
|
||||
tuple for the purposes of tracking reservation space.
|
||||
|
||||
The reserve grant head is used to accurately account for exact transaction
|
||||
reservation amounts and the exact byte count that modifications actually make
|
||||
and need to write into the log. The reserve head is used to prevent new
|
||||
transactions from taking new reservations when the head reaches the current
|
||||
tail. It will block new reservations in a FIFO queue and as the log tail moves
|
||||
forward it will wake them in order once sufficient space is available. This FIFO
|
||||
mechanism ensures no transaction is starved of resources when log space
|
||||
shortages occur.
|
||||
|
||||
The other grant head is the "write" head. Unlike the reserve head, this grant
|
||||
head contains an LSN and it tracks the physical space usage in the log. While
|
||||
this might sound like it is accounting the same state as the reserve grant head
|
||||
- and it mostly does track exactly the same location as the reserve grant head -
|
||||
there are critical differences in behaviour between them that provide the
|
||||
forwards progress guarantees that rolling permanent transactions require.
|
||||
|
||||
These differences become apparent when a permanent transaction is rolled and the internal "log
|
||||
count" reaches zero and the initial set of unit reservations have been
|
||||
exhausted. At this point, we still require a log space reservation to continue
|
||||
the next transaction in the sequence, but we have none remaining. We cannot
|
||||
sleep during the transaction commit process waiting for new log space to become
|
||||
available, as we may end up on the end of the FIFO queue and the items we have
|
||||
locked while we sleep could end up pinning the tail of the log before there is
|
||||
enough free space in the log to fulfil all of the pending reservations and
|
||||
then wake up the transaction commit in progress.
|
||||
|
||||
To take a new reservation without sleeping requires us to be able to take a
|
||||
reservation even if there is no reservation space currently available. That is,
|
||||
we need to be able to *overcommit* the log reservation space. As has already
|
||||
been detailed, we cannot overcommit physical log space. However, the reserve
|
||||
grant head does not track physical space - it only accounts for the amount of
|
||||
reservations we currently have outstanding. Hence if the reserve head passes
|
||||
over the tail of the log all it means is that new reservations will be throttled
|
||||
immediately and remain throttled until the log tail is moved forward far enough
|
||||
to remove the overcommit and start taking new reservations. In other words, we
|
||||
can overcommit the reserve head without violating the physical log head and tail
|
||||
rules.
|
||||
|
||||
As a result, permanent transactions only "regrant" reservation space during
|
||||
xfs_trans_commit() calls, while the physical log space reservation - tracked by
|
||||
the write head - is then reserved separately by a call to xfs_log_reserve()
|
||||
after the commit completes. Once the commit completes, we can sleep waiting for
|
||||
physical log space to be reserved from the write grant head, but only if one
|
||||
critical rule has been observed::
|
||||
|
||||
Code using permanent reservations must always log the items they hold
|
||||
locked across each transaction they roll in the chain.
|
||||
|
||||
"Re-logging" the locked items on every transaction roll ensures that the items
|
||||
attached to the transaction chain being rolled are always relocated to the
|
||||
physical head of the log and so do not pin the tail of the log. If a locked item
|
||||
pins the tail of the log when we sleep on the write reservation, then we will
|
||||
deadlock the log as we cannot take the locks needed to write back that item and
|
||||
move the tail of the log forwards to free up write grant space. Re-logging the
|
||||
locked items avoids this deadlock and guarantees that the log reservation we are
|
||||
making cannot self-deadlock.
|
||||
|
||||
If all rolling transactions obey this rule, then they can all make forwards
|
||||
progress independently because nothing will block the progress of the log
|
||||
tail moving forwards and hence ensuring that write grant space is always
|
||||
(eventually) made available to permanent transactions no matter how many times
|
||||
they roll.
|
||||
|
||||
|
||||
Re-logging Explained
|
||||
====================
|
||||
|
||||
XFS allows multiple separate modifications to a single object to be carried in
|
||||
the log at any given time. This allows the log to avoid needing to flush each
|
||||
change to disk before recording a new change to the object. XFS does this via a
|
||||
method called "re-logging". Conceptually, this is quite simple - all it requires
|
||||
is that any new change to the object is recorded with a *new copy* of all the
|
||||
existing changes in the new transaction that is written to the log.
|
||||
|
||||
That is, if we have a sequence of changes A through to F, and the object was
|
||||
written to disk after change D, we would see in the log the following series
|
||||
@@ -42,16 +327,13 @@ transaction::
|
||||
In other words, each time an object is relogged, the new transaction contains
|
||||
the aggregation of all the previous changes currently held only in the log.
|
||||
|
||||
This relogging technique also allows objects to be moved forward in the log so
|
||||
that an object being relogged does not prevent the tail of the log from ever
|
||||
moving forward. This can be seen in the table above by the changing
|
||||
(increasing) LSN of each subsequent transaction - the LSN is effectively a
|
||||
direct encoding of the location in the log of the transaction.
|
||||
This relogging technique allows objects to be moved forward in the log so that
|
||||
an object being relogged does not prevent the tail of the log from ever moving
|
||||
forward. This can be seen in the table above by the changing (increasing) LSN
|
||||
of each subsequent transaction, and it's the technique that allows us to
|
||||
implement long-running, multiple-commit permanent transactions.
|
||||
|
||||
This relogging is also used to implement long-running, multiple-commit
|
||||
transactions. These transaction are known as rolling transactions, and require
|
||||
a special log reservation known as a permanent transaction reservation. A
|
||||
typical example of a rolling transaction is the removal of extents from an
|
||||
A typical example of a rolling transaction is the removal of extents from an
|
||||
inode which can only be done at a rate of two extents per transaction because
|
||||
of reservation size limitations. Hence a rolling extent removal transaction
|
||||
keeps relogging the inode and btree buffers as they get modified in each
|
||||
@@ -67,12 +349,13 @@ the log over and over again. Worse is the fact that objects tend to get
|
||||
dirtier as they get relogged, so each subsequent transaction is writing more
|
||||
metadata into the log.
|
||||
|
||||
Another feature of the XFS transaction subsystem is that most transactions are
|
||||
asynchronous. That is, they don't commit to disk until either a log buffer is
|
||||
filled (a log buffer can hold multiple transactions) or a synchronous operation
|
||||
forces the log buffers holding the transactions to disk. This means that XFS is
|
||||
doing aggregation of transactions in memory - batching them, if you like - to
|
||||
minimise the impact of the log IO on transaction throughput.
|
||||
It should now also be obvious how relogging and asynchronous transactions go
|
||||
hand in hand. That is, transactions don't get written to the physical journal
|
||||
until either a log buffer is filled (a log buffer can hold multiple
|
||||
transactions) or a synchronous operation forces the log buffers holding the
|
||||
transactions to disk. This means that XFS is doing aggregation of transactions
|
||||
in memory - batching them, if you like - to minimise the impact of the log IO on
|
||||
transaction throughput.
|
||||
|
||||
The limitation on asynchronous transaction throughput is the number and size of
|
||||
log buffers made available by the log manager. By default there are 8 log
|
||||
|
||||
12
MAINTAINERS
12
MAINTAINERS
@@ -736,6 +736,14 @@ S: Maintained
|
||||
F: Documentation/i2c/busses/i2c-ali1563.rst
|
||||
F: drivers/i2c/busses/i2c-ali1563.c
|
||||
|
||||
ALIBABA ELASTIC RDMA DRIVER
|
||||
M: Cheng Xu <chengyou@linux.alibaba.com>
|
||||
M: Kai Shen <kaishen@linux.alibaba.com>
|
||||
L: linux-rdma@vger.kernel.org
|
||||
S: Supported
|
||||
F: drivers/infiniband/hw/erdma
|
||||
F: include/uapi/rdma/erdma-abi.h
|
||||
|
||||
ALIENWARE WMI DRIVER
|
||||
L: Dell.Client.Kernel@dell.com
|
||||
S: Maintained
|
||||
@@ -14506,7 +14514,8 @@ S: Supported
|
||||
W: http://git.infradead.org/nvme.git
|
||||
T: git://git.infradead.org/nvme.git
|
||||
F: drivers/nvme/host/
|
||||
F: include/linux/nvme.h
|
||||
F: drivers/nvme/common/
|
||||
F: include/linux/nvme*
|
||||
F: include/uapi/linux/nvme_ioctl.h
|
||||
|
||||
NVM EXPRESS FC TRANSPORT DRIVERS
|
||||
@@ -18837,6 +18846,7 @@ SOFTWARE RAID (Multiple Disks) SUPPORT
|
||||
M: Song Liu <song@kernel.org>
|
||||
L: linux-raid@vger.kernel.org
|
||||
S: Supported
|
||||
Q: https://patchwork.kernel.org/project/linux-raid/list/
|
||||
T: git git://git.kernel.org/pub/scm/linux/kernel/git/song/md.git
|
||||
F: drivers/md/Kconfig
|
||||
F: drivers/md/Makefile
|
||||
|
||||
@@ -134,7 +134,7 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
|
||||
iv = bip->bip_vec + bip->bip_vcnt;
|
||||
|
||||
if (bip->bip_vcnt &&
|
||||
bvec_gap_to_prev(bdev_get_queue(bio->bi_bdev),
|
||||
bvec_gap_to_prev(&bdev_get_queue(bio->bi_bdev)->limits,
|
||||
&bip->bip_vec[bip->bip_vcnt - 1], offset))
|
||||
return 0;
|
||||
|
||||
|
||||
51
block/bio.c
51
block/bio.c
@@ -968,7 +968,7 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio,
|
||||
* would create a gap, disallow it.
|
||||
*/
|
||||
bvec = &bio->bi_io_vec[bio->bi_vcnt - 1];
|
||||
if (bvec_gap_to_prev(q, bvec, offset))
|
||||
if (bvec_gap_to_prev(&q->limits, bvec, offset))
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -1154,22 +1154,12 @@ void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
|
||||
bio_set_flag(bio, BIO_CLONED);
|
||||
}
|
||||
|
||||
static void bio_put_pages(struct page **pages, size_t size, size_t off)
|
||||
{
|
||||
size_t i, nr = DIV_ROUND_UP(size + (off & ~PAGE_MASK), PAGE_SIZE);
|
||||
|
||||
for (i = 0; i < nr; i++)
|
||||
put_page(pages[i]);
|
||||
}
|
||||
|
||||
static int bio_iov_add_page(struct bio *bio, struct page *page,
|
||||
unsigned int len, unsigned int offset)
|
||||
{
|
||||
bool same_page = false;
|
||||
|
||||
if (!__bio_try_merge_page(bio, page, len, offset, &same_page)) {
|
||||
if (WARN_ON_ONCE(bio_full(bio, len)))
|
||||
return -EINVAL;
|
||||
__bio_add_page(bio, page, len, offset);
|
||||
return 0;
|
||||
}
|
||||
@@ -1212,8 +1202,9 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
|
||||
struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
|
||||
struct page **pages = (struct page **)bv;
|
||||
ssize_t size, left;
|
||||
unsigned len, i;
|
||||
unsigned len, i = 0;
|
||||
size_t offset;
|
||||
int ret = 0;
|
||||
|
||||
/*
|
||||
* Move page array up in the allocated memory for the bio vecs as far as
|
||||
@@ -1230,32 +1221,40 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
|
||||
* result to ensure the bio's total size is correct. The remainder of
|
||||
* the iov data will be picked up in the next bio iteration.
|
||||
*/
|
||||
size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
|
||||
if (size > 0)
|
||||
size = iov_iter_get_pages(iter, pages, UINT_MAX - bio->bi_iter.bi_size,
|
||||
nr_pages, &offset);
|
||||
if (size > 0) {
|
||||
nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE);
|
||||
size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev));
|
||||
if (unlikely(size <= 0))
|
||||
return size ? size : -EFAULT;
|
||||
} else
|
||||
nr_pages = 0;
|
||||
|
||||
if (unlikely(size <= 0)) {
|
||||
ret = size ? size : -EFAULT;
|
||||
goto out;
|
||||
}
|
||||
|
||||
for (left = size, i = 0; left > 0; left -= len, i++) {
|
||||
struct page *page = pages[i];
|
||||
int ret;
|
||||
|
||||
len = min_t(size_t, PAGE_SIZE - offset, left);
|
||||
if (bio_op(bio) == REQ_OP_ZONE_APPEND)
|
||||
if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
|
||||
ret = bio_iov_add_zone_append_page(bio, page, len,
|
||||
offset);
|
||||
else
|
||||
ret = bio_iov_add_page(bio, page, len, offset);
|
||||
if (ret)
|
||||
break;
|
||||
} else
|
||||
bio_iov_add_page(bio, page, len, offset);
|
||||
|
||||
if (ret) {
|
||||
bio_put_pages(pages + i, left, offset);
|
||||
return ret;
|
||||
}
|
||||
offset = 0;
|
||||
}
|
||||
|
||||
iov_iter_advance(iter, size);
|
||||
return 0;
|
||||
iov_iter_advance(iter, size - left);
|
||||
out:
|
||||
while (i < nr_pages)
|
||||
put_page(pages[i++]);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@@ -377,7 +377,6 @@ static void blk_timeout_work(struct work_struct *work)
|
||||
struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu)
|
||||
{
|
||||
struct request_queue *q;
|
||||
int ret;
|
||||
|
||||
q = kmem_cache_alloc_node(blk_get_queue_kmem_cache(alloc_srcu),
|
||||
GFP_KERNEL | __GFP_ZERO, node_id);
|
||||
@@ -396,13 +395,9 @@ struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu)
|
||||
if (q->id < 0)
|
||||
goto fail_srcu;
|
||||
|
||||
ret = bioset_init(&q->bio_split, BIO_POOL_SIZE, 0, 0);
|
||||
if (ret)
|
||||
goto fail_id;
|
||||
|
||||
q->stats = blk_alloc_queue_stats();
|
||||
if (!q->stats)
|
||||
goto fail_split;
|
||||
goto fail_id;
|
||||
|
||||
q->node = node_id;
|
||||
|
||||
@@ -439,8 +434,6 @@ struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu)
|
||||
|
||||
fail_stats:
|
||||
blk_free_queue_stats(q->stats);
|
||||
fail_split:
|
||||
bioset_exit(&q->bio_split);
|
||||
fail_id:
|
||||
ida_free(&blk_queue_ida, q->id);
|
||||
fail_srcu:
|
||||
|
||||
@@ -82,7 +82,7 @@ static inline bool bio_will_gap(struct request_queue *q,
|
||||
bio_get_first_bvec(next, &nb);
|
||||
if (biovec_phys_mergeable(q, &pb, &nb))
|
||||
return false;
|
||||
return __bvec_gap_to_prev(q, &pb, nb.bv_offset);
|
||||
return __bvec_gap_to_prev(&q->limits, &pb, nb.bv_offset);
|
||||
}
|
||||
|
||||
static inline bool req_gap_back_merge(struct request *req, struct bio *bio)
|
||||
@@ -95,23 +95,30 @@ static inline bool req_gap_front_merge(struct request *req, struct bio *bio)
|
||||
return bio_will_gap(req->q, NULL, bio, req->bio);
|
||||
}
|
||||
|
||||
static struct bio *blk_bio_discard_split(struct request_queue *q,
|
||||
struct bio *bio,
|
||||
struct bio_set *bs,
|
||||
unsigned *nsegs)
|
||||
/*
|
||||
* The max size one bio can handle is UINT_MAX because bvec_iter.bi_size
|
||||
* is defined as 'unsigned int'; meanwhile it has to be aligned to the
|
||||
* logical block size, which is the minimum accepted unit by hardware.
|
||||
*/
|
||||
static unsigned int bio_allowed_max_sectors(struct queue_limits *lim)
|
||||
{
|
||||
return round_down(UINT_MAX, lim->logical_block_size) >> SECTOR_SHIFT;
|
||||
}
|
||||
|
||||
static struct bio *bio_split_discard(struct bio *bio, struct queue_limits *lim,
|
||||
unsigned *nsegs, struct bio_set *bs)
|
||||
{
|
||||
unsigned int max_discard_sectors, granularity;
|
||||
int alignment;
|
||||
sector_t tmp;
|
||||
unsigned split_sectors;
|
||||
|
||||
*nsegs = 1;
|
||||
|
||||
/* Zero-sector (unknown) and one-sector granularities are the same. */
|
||||
granularity = max(q->limits.discard_granularity >> 9, 1U);
|
||||
granularity = max(lim->discard_granularity >> 9, 1U);
|
||||
|
||||
max_discard_sectors = min(q->limits.max_discard_sectors,
|
||||
bio_allowed_max_sectors(q));
|
||||
max_discard_sectors =
|
||||
min(lim->max_discard_sectors, bio_allowed_max_sectors(lim));
|
||||
max_discard_sectors -= max_discard_sectors % granularity;
|
||||
|
||||
if (unlikely(!max_discard_sectors)) {
|
||||
@@ -128,9 +135,8 @@ static struct bio *blk_bio_discard_split(struct request_queue *q,
|
||||
* If the next starting sector would be misaligned, stop the discard at
|
||||
* the previous aligned sector.
|
||||
*/
|
||||
alignment = (q->limits.discard_alignment >> 9) % granularity;
|
||||
|
||||
tmp = bio->bi_iter.bi_sector + split_sectors - alignment;
|
||||
tmp = bio->bi_iter.bi_sector + split_sectors -
|
||||
((lim->discard_alignment >> 9) % granularity);
|
||||
tmp = sector_div(tmp, granularity);
|
||||
|
||||
if (split_sectors > tmp)
|
||||
@@ -139,18 +145,15 @@ static struct bio *blk_bio_discard_split(struct request_queue *q,
|
||||
return bio_split(bio, split_sectors, GFP_NOIO, bs);
|
||||
}
|
||||
|
||||
static struct bio *blk_bio_write_zeroes_split(struct request_queue *q,
|
||||
struct bio *bio, struct bio_set *bs, unsigned *nsegs)
|
||||
static struct bio *bio_split_write_zeroes(struct bio *bio,
|
||||
struct queue_limits *lim, unsigned *nsegs, struct bio_set *bs)
|
||||
{
|
||||
*nsegs = 0;
|
||||
|
||||
if (!q->limits.max_write_zeroes_sectors)
|
||||
if (!lim->max_write_zeroes_sectors)
|
||||
return NULL;
|
||||
|
||||
if (bio_sectors(bio) <= q->limits.max_write_zeroes_sectors)
|
||||
if (bio_sectors(bio) <= lim->max_write_zeroes_sectors)
|
||||
return NULL;
|
||||
|
||||
return bio_split(bio, q->limits.max_write_zeroes_sectors, GFP_NOIO, bs);
|
||||
return bio_split(bio, lim->max_write_zeroes_sectors, GFP_NOIO, bs);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -161,17 +164,17 @@ static struct bio *blk_bio_write_zeroes_split(struct request_queue *q,
|
||||
* requests that are submitted to a block device if the start of a bio is not
|
||||
* aligned to a physical block boundary.
|
||||
*/
|
||||
static inline unsigned get_max_io_size(struct request_queue *q,
|
||||
struct bio *bio)
|
||||
static inline unsigned get_max_io_size(struct bio *bio,
|
||||
struct queue_limits *lim)
|
||||
{
|
||||
unsigned pbs = queue_physical_block_size(q) >> SECTOR_SHIFT;
|
||||
unsigned lbs = queue_logical_block_size(q) >> SECTOR_SHIFT;
|
||||
unsigned max_sectors = queue_max_sectors(q), start, end;
|
||||
unsigned pbs = lim->physical_block_size >> SECTOR_SHIFT;
|
||||
unsigned lbs = lim->logical_block_size >> SECTOR_SHIFT;
|
||||
unsigned max_sectors = lim->max_sectors, start, end;
|
||||
|
||||
if (q->limits.chunk_sectors) {
|
||||
if (lim->chunk_sectors) {
|
||||
max_sectors = min(max_sectors,
|
||||
blk_chunk_sectors_left(bio->bi_iter.bi_sector,
|
||||
q->limits.chunk_sectors));
|
||||
lim->chunk_sectors));
|
||||
}
|
||||
|
||||
start = bio->bi_iter.bi_sector & (pbs - 1);
|
||||
@@ -181,11 +184,10 @@ static inline unsigned get_max_io_size(struct request_queue *q,
|
||||
return max_sectors & ~(lbs - 1);
|
||||
}
|
||||
|
||||
static inline unsigned get_max_segment_size(const struct request_queue *q,
|
||||
struct page *start_page,
|
||||
unsigned long offset)
|
||||
static inline unsigned get_max_segment_size(struct queue_limits *lim,
|
||||
struct page *start_page, unsigned long offset)
|
||||
{
|
||||
unsigned long mask = queue_segment_boundary(q);
|
||||
unsigned long mask = lim->seg_boundary_mask;
|
||||
|
||||
offset = mask & (page_to_phys(start_page) + offset);
|
||||
|
||||
@@ -194,12 +196,12 @@ static inline unsigned get_max_segment_size(const struct request_queue *q,
|
||||
* on 32bit arch, use queue's max segment size when that happens.
|
||||
*/
|
||||
return min_not_zero(mask - offset + 1,
|
||||
(unsigned long)queue_max_segment_size(q));
|
||||
(unsigned long)lim->max_segment_size);
|
||||
}
|
||||
|
||||
/**
|
||||
* bvec_split_segs - verify whether or not a bvec should be split in the middle
|
||||
* @q: [in] request queue associated with the bio associated with @bv
|
||||
* @lim: [in] queue limits to split based on
|
||||
* @bv: [in] bvec to examine
|
||||
* @nsegs: [in,out] Number of segments in the bio being built. Incremented
|
||||
* by the number of segments from @bv that may be appended to that
|
||||
@@ -217,10 +219,9 @@ static inline unsigned get_max_segment_size(const struct request_queue *q,
|
||||
* *@nsegs segments and *@sectors sectors would make that bio unacceptable for
|
||||
* the block driver.
|
||||
*/
|
||||
static bool bvec_split_segs(const struct request_queue *q,
|
||||
const struct bio_vec *bv, unsigned *nsegs,
|
||||
unsigned *bytes, unsigned max_segs,
|
||||
unsigned max_bytes)
|
||||
static bool bvec_split_segs(struct queue_limits *lim, const struct bio_vec *bv,
|
||||
unsigned *nsegs, unsigned *bytes, unsigned max_segs,
|
||||
unsigned max_bytes)
|
||||
{
|
||||
unsigned max_len = min(max_bytes, UINT_MAX) - *bytes;
|
||||
unsigned len = min(bv->bv_len, max_len);
|
||||
@@ -228,7 +229,7 @@ static bool bvec_split_segs(const struct request_queue *q,
|
||||
unsigned seg_size = 0;
|
||||
|
||||
while (len && *nsegs < max_segs) {
|
||||
seg_size = get_max_segment_size(q, bv->bv_page,
|
||||
seg_size = get_max_segment_size(lim, bv->bv_page,
|
||||
bv->bv_offset + total_len);
|
||||
seg_size = min(seg_size, len);
|
||||
|
||||
@@ -236,7 +237,7 @@ static bool bvec_split_segs(const struct request_queue *q,
|
||||
total_len += seg_size;
|
||||
len -= seg_size;
|
||||
|
||||
if ((bv->bv_offset + total_len) & queue_virt_boundary(q))
|
||||
if ((bv->bv_offset + total_len) & lim->virt_boundary_mask)
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -247,16 +248,17 @@ static bool bvec_split_segs(const struct request_queue *q,
|
||||
}
|
||||
|
||||
/**
|
||||
* blk_bio_segment_split - split a bio in two bios
|
||||
* @q: [in] request queue pointer
|
||||
* bio_split_rw - split a bio in two bios
|
||||
* @bio: [in] bio to be split
|
||||
* @bs: [in] bio set to allocate the clone from
|
||||
* @lim: [in] queue limits to split based on
|
||||
* @segs: [out] number of segments in the bio with the first half of the sectors
|
||||
* @bs: [in] bio set to allocate the clone from
|
||||
* @max_bytes: [in] maximum number of bytes per bio
|
||||
*
|
||||
* Clone @bio, update the bi_iter of the clone to represent the first sectors
|
||||
* of @bio and update @bio->bi_iter to represent the remaining sectors. The
|
||||
* following is guaranteed for the cloned bio:
|
||||
* - That it has at most get_max_io_size(@q, @bio) sectors.
|
||||
* - That it has at most @max_bytes worth of data
|
||||
* - That it has at most @lim->max_segments segments.
|
||||
*
|
||||
* Except for discard requests the cloned bio will point at the bi_io_vec of
|
||||
@@ -265,33 +267,30 @@ static bool bvec_split_segs(const struct request_queue *q,
|
||||
* responsible for ensuring that @bs is only destroyed after processing of the
|
||||
* split bio has finished.
|
||||
*/
|
||||
static struct bio *blk_bio_segment_split(struct request_queue *q,
|
||||
struct bio *bio,
|
||||
struct bio_set *bs,
|
||||
unsigned *segs)
|
||||
static struct bio *bio_split_rw(struct bio *bio, struct queue_limits *lim,
|
||||
unsigned *segs, struct bio_set *bs, unsigned max_bytes)
|
||||
{
|
||||
struct bio_vec bv, bvprv, *bvprvp = NULL;
|
||||
struct bvec_iter iter;
|
||||
unsigned nsegs = 0, bytes = 0;
|
||||
const unsigned max_bytes = get_max_io_size(q, bio) << 9;
|
||||
const unsigned max_segs = queue_max_segments(q);
|
||||
|
||||
bio_for_each_bvec(bv, bio, iter) {
|
||||
/*
|
||||
* If the queue doesn't support SG gaps and adding this
|
||||
* offset would create a gap, disallow it.
|
||||
*/
|
||||
if (bvprvp && bvec_gap_to_prev(q, bvprvp, bv.bv_offset))
|
||||
if (bvprvp && bvec_gap_to_prev(lim, bvprvp, bv.bv_offset))
|
||||
goto split;
|
||||
|
||||
if (nsegs < max_segs &&
|
||||
if (nsegs < lim->max_segments &&
|
||||
bytes + bv.bv_len <= max_bytes &&
|
||||
bv.bv_offset + bv.bv_len <= PAGE_SIZE) {
|
||||
nsegs++;
|
||||
bytes += bv.bv_len;
|
||||
} else if (bvec_split_segs(q, &bv, &nsegs, &bytes, max_segs,
|
||||
max_bytes)) {
|
||||
goto split;
|
||||
} else {
|
||||
if (bvec_split_segs(lim, &bv, &nsegs, &bytes,
|
||||
lim->max_segments, max_bytes))
|
||||
goto split;
|
||||
}
|
||||
|
||||
bvprv = bv;
|
||||
@@ -308,7 +307,7 @@ split:
|
||||
* split size so that each bio is properly block size aligned, even if
|
||||
* we do not use the full hardware limits.
|
||||
*/
|
||||
bytes = ALIGN_DOWN(bytes, queue_logical_block_size(q));
|
||||
bytes = ALIGN_DOWN(bytes, lim->logical_block_size);
|
||||
|
||||
/*
|
||||
* Bio splitting may cause subtle trouble such as hang when doing sync
|
||||
@@ -320,34 +319,35 @@ split:
|
||||
}
|
||||
|
||||
/**
|
||||
* __blk_queue_split - split a bio and submit the second half
|
||||
* @q: [in] request_queue new bio is being queued at
|
||||
* @bio: [in, out] bio to be split
|
||||
* @nr_segs: [out] number of segments in the first bio
|
||||
* __bio_split_to_limits - split a bio to fit the queue limits
|
||||
* @bio: bio to be split
|
||||
* @lim: queue limits to split based on
|
||||
* @nr_segs: returns the number of segments in the returned bio
|
||||
*
|
||||
* Split a bio into two bios, chain the two bios, submit the second half and
|
||||
* store a pointer to the first half in *@bio. If the second bio is still too
|
||||
* big it will be split by a recursive call to this function. Since this
|
||||
* function may allocate a new bio from q->bio_split, it is the responsibility
|
||||
* of the caller to ensure that q->bio_split is only released after processing
|
||||
* of the split bio has finished.
|
||||
* Check if @bio needs splitting based on the queue limits, and if so split off
|
||||
* a bio fitting the limits from the beginning of @bio and return it. @bio is
|
||||
* shortened to the remainder and re-submitted.
|
||||
*
|
||||
* The split bio is allocated from the bio_split bio_set of @bio's gendisk (@bio->bi_bdev->bd_disk->bio_split), which is provided by the
|
||||
* block layer.
|
||||
*/
|
||||
void __blk_queue_split(struct request_queue *q, struct bio **bio,
|
||||
struct bio *__bio_split_to_limits(struct bio *bio, struct queue_limits *lim,
|
||||
unsigned int *nr_segs)
|
||||
{
|
||||
struct bio *split = NULL;
|
||||
struct bio_set *bs = &bio->bi_bdev->bd_disk->bio_split;
|
||||
struct bio *split;
|
||||
|
||||
switch (bio_op(*bio)) {
|
||||
switch (bio_op(bio)) {
|
||||
case REQ_OP_DISCARD:
|
||||
case REQ_OP_SECURE_ERASE:
|
||||
split = blk_bio_discard_split(q, *bio, &q->bio_split, nr_segs);
|
||||
split = bio_split_discard(bio, lim, nr_segs, bs);
|
||||
break;
|
||||
case REQ_OP_WRITE_ZEROES:
|
||||
split = blk_bio_write_zeroes_split(q, *bio, &q->bio_split,
|
||||
nr_segs);
|
||||
split = bio_split_write_zeroes(bio, lim, nr_segs, bs);
|
||||
break;
|
||||
default:
|
||||
split = blk_bio_segment_split(q, *bio, &q->bio_split, nr_segs);
|
||||
split = bio_split_rw(bio, lim, nr_segs, bs,
|
||||
get_max_io_size(bio, lim) << SECTOR_SHIFT);
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -356,32 +356,35 @@ void __blk_queue_split(struct request_queue *q, struct bio **bio,
|
||||
split->bi_opf |= REQ_NOMERGE;
|
||||
|
||||
blkcg_bio_issue_init(split);
|
||||
bio_chain(split, *bio);
|
||||
trace_block_split(split, (*bio)->bi_iter.bi_sector);
|
||||
submit_bio_noacct(*bio);
|
||||
*bio = split;
|
||||
bio_chain(split, bio);
|
||||
trace_block_split(split, bio->bi_iter.bi_sector);
|
||||
submit_bio_noacct(bio);
|
||||
return split;
|
||||
}
|
||||
return bio;
|
||||
}
|
||||
|
||||
/**
|
||||
* blk_queue_split - split a bio and submit the second half
|
||||
* @bio: [in, out] bio to be split
|
||||
* bio_split_to_limits - split a bio to fit the queue limits
|
||||
* @bio: bio to be split
|
||||
*
|
||||
* Split a bio into two bios, chains the two bios, submit the second half and
|
||||
* store a pointer to the first half in *@bio. Since this function may allocate
|
||||
* a new bio from q->bio_split, it is the responsibility of the caller to ensure
|
||||
* that q->bio_split is only released after processing of the split bio has
|
||||
* finished.
|
||||
* Check if @bio needs splitting based on the queue limits of @bio->bi_bdev, and
|
||||
* if so split off a bio fitting the limits from the beginning of @bio and
|
||||
* return it. @bio is shortened to the remainder and re-submitted.
|
||||
*
|
||||
* The split bio is allocated from the bio_split bio_set of @bio's gendisk (@bio->bi_bdev->bd_disk->bio_split), which is provided by the
|
||||
* block layer.
|
||||
*/
|
||||
void blk_queue_split(struct bio **bio)
|
||||
struct bio *bio_split_to_limits(struct bio *bio)
|
||||
{
|
||||
struct request_queue *q = bdev_get_queue((*bio)->bi_bdev);
|
||||
struct queue_limits *lim = &bdev_get_queue(bio->bi_bdev)->limits;
|
||||
unsigned int nr_segs;
|
||||
|
||||
if (blk_may_split(q, *bio))
|
||||
__blk_queue_split(q, bio, &nr_segs);
|
||||
if (bio_may_exceed_limits(bio, lim))
|
||||
return __bio_split_to_limits(bio, lim, &nr_segs);
|
||||
return bio;
|
||||
}
|
||||
EXPORT_SYMBOL(blk_queue_split);
|
||||
EXPORT_SYMBOL(bio_split_to_limits);
|
||||
|
||||
unsigned int blk_recalc_rq_segments(struct request *rq)
|
||||
{
|
||||
@@ -411,7 +414,7 @@ unsigned int blk_recalc_rq_segments(struct request *rq)
|
||||
}
|
||||
|
||||
rq_for_each_bvec(bv, rq, iter)
|
||||
bvec_split_segs(rq->q, &bv, &nr_phys_segs, &bytes,
|
||||
bvec_split_segs(&rq->q->limits, &bv, &nr_phys_segs, &bytes,
|
||||
UINT_MAX, UINT_MAX);
|
||||
return nr_phys_segs;
|
||||
}
|
||||
@@ -442,8 +445,8 @@ static unsigned blk_bvec_map_sg(struct request_queue *q,
|
||||
|
||||
while (nbytes > 0) {
|
||||
unsigned offset = bvec->bv_offset + total;
|
||||
unsigned len = min(get_max_segment_size(q, bvec->bv_page,
|
||||
offset), nbytes);
|
||||
unsigned len = min(get_max_segment_size(&q->limits,
|
||||
bvec->bv_page, offset), nbytes);
|
||||
struct page *page = bvec->bv_page;
|
||||
|
||||
/*
|
||||
|
||||
@@ -2815,9 +2815,9 @@ void blk_mq_submit_bio(struct bio *bio)
|
||||
unsigned int nr_segs = 1;
|
||||
blk_status_t ret;
|
||||
|
||||
blk_queue_bounce(q, &bio);
|
||||
if (blk_may_split(q, bio))
|
||||
__blk_queue_split(q, &bio, &nr_segs);
|
||||
bio = blk_queue_bounce(bio, q);
|
||||
if (bio_may_exceed_limits(bio, &q->limits))
|
||||
bio = __bio_split_to_limits(bio, &q->limits, &nr_segs);
|
||||
|
||||
if (!bio_integrity_prep(bio))
|
||||
return;
|
||||
|
||||
@@ -779,8 +779,6 @@ static void blk_release_queue(struct kobject *kobj)
|
||||
if (queue_is_mq(q))
|
||||
blk_mq_release(q);
|
||||
|
||||
bioset_exit(&q->bio_split);
|
||||
|
||||
if (blk_queue_has_srcu(q))
|
||||
cleanup_srcu_struct(q->srcu);
|
||||
|
||||
|
||||
47
block/blk.h
47
block/blk.h
@@ -97,23 +97,23 @@ static inline bool biovec_phys_mergeable(struct request_queue *q,
|
||||
return true;
|
||||
}
|
||||
|
||||
static inline bool __bvec_gap_to_prev(struct request_queue *q,
|
||||
static inline bool __bvec_gap_to_prev(struct queue_limits *lim,
|
||||
struct bio_vec *bprv, unsigned int offset)
|
||||
{
|
||||
return (offset & queue_virt_boundary(q)) ||
|
||||
((bprv->bv_offset + bprv->bv_len) & queue_virt_boundary(q));
|
||||
return (offset & lim->virt_boundary_mask) ||
|
||||
((bprv->bv_offset + bprv->bv_len) & lim->virt_boundary_mask);
|
||||
}
|
||||
|
||||
/*
|
||||
* Check if adding a bio_vec after bprv with offset would create a gap in
|
||||
* the SG list. Most drivers don't care about this, but some do.
|
||||
*/
|
||||
static inline bool bvec_gap_to_prev(struct request_queue *q,
|
||||
static inline bool bvec_gap_to_prev(struct queue_limits *lim,
|
||||
struct bio_vec *bprv, unsigned int offset)
|
||||
{
|
||||
if (!queue_virt_boundary(q))
|
||||
if (!lim->virt_boundary_mask)
|
||||
return false;
|
||||
return __bvec_gap_to_prev(q, bprv, offset);
|
||||
return __bvec_gap_to_prev(lim, bprv, offset);
|
||||
}
|
||||
|
||||
static inline bool rq_mergeable(struct request *rq)
|
||||
@@ -189,7 +189,8 @@ static inline bool integrity_req_gap_back_merge(struct request *req,
|
||||
struct bio_integrity_payload *bip = bio_integrity(req->bio);
|
||||
struct bio_integrity_payload *bip_next = bio_integrity(next);
|
||||
|
||||
return bvec_gap_to_prev(req->q, &bip->bip_vec[bip->bip_vcnt - 1],
|
||||
return bvec_gap_to_prev(&req->q->limits,
|
||||
&bip->bip_vec[bip->bip_vcnt - 1],
|
||||
bip_next->bip_vec[0].bv_offset);
|
||||
}
|
||||
|
||||
@@ -199,7 +200,8 @@ static inline bool integrity_req_gap_front_merge(struct request *req,
|
||||
struct bio_integrity_payload *bip = bio_integrity(bio);
|
||||
struct bio_integrity_payload *bip_next = bio_integrity(req->bio);
|
||||
|
||||
return bvec_gap_to_prev(req->q, &bip->bip_vec[bip->bip_vcnt - 1],
|
||||
return bvec_gap_to_prev(&req->q->limits,
|
||||
&bip->bip_vec[bip->bip_vcnt - 1],
|
||||
bip_next->bip_vec[0].bv_offset);
|
||||
}
|
||||
|
||||
@@ -288,7 +290,8 @@ ssize_t part_timeout_show(struct device *, struct device_attribute *, char *);
|
||||
ssize_t part_timeout_store(struct device *, struct device_attribute *,
|
||||
const char *, size_t);
|
||||
|
||||
static inline bool blk_may_split(struct request_queue *q, struct bio *bio)
|
||||
static inline bool bio_may_exceed_limits(struct bio *bio,
|
||||
struct queue_limits *lim)
|
||||
{
|
||||
switch (bio_op(bio)) {
|
||||
case REQ_OP_DISCARD:
|
||||
@@ -307,12 +310,12 @@ static inline bool blk_may_split(struct request_queue *q, struct bio *bio)
|
||||
* to the performance impact of cloned bios themselves the loop below
|
||||
* doesn't matter anyway.
|
||||
*/
|
||||
return q->limits.chunk_sectors || bio->bi_vcnt != 1 ||
|
||||
return lim->chunk_sectors || bio->bi_vcnt != 1 ||
|
||||
bio->bi_io_vec->bv_len + bio->bi_io_vec->bv_offset > PAGE_SIZE;
|
||||
}
|
||||
|
||||
void __blk_queue_split(struct request_queue *q, struct bio **bio,
|
||||
unsigned int *nr_segs);
|
||||
struct bio *__bio_split_to_limits(struct bio *bio, struct queue_limits *lim,
|
||||
unsigned int *nr_segs);
|
||||
int ll_back_merge_fn(struct request *req, struct bio *bio,
|
||||
unsigned int nr_segs);
|
||||
bool blk_attempt_req_merge(struct request_queue *q, struct request *rq,
|
||||
@@ -344,16 +347,6 @@ static inline void req_set_nomerge(struct request_queue *q, struct request *req)
|
||||
q->last_merge = NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* The max size one bio can handle is UINT_MAX becasue bvec_iter.bi_size
|
||||
* is defined as 'unsigned int', meantime it has to aligned to with logical
|
||||
* block size which is the minimum accepted unit by hardware.
|
||||
*/
|
||||
static inline unsigned int bio_allowed_max_sectors(struct request_queue *q)
|
||||
{
|
||||
return round_down(UINT_MAX, queue_logical_block_size(q)) >> 9;
|
||||
}
|
||||
|
||||
/*
|
||||
* Internal io_context interface
|
||||
*/
|
||||
@@ -378,7 +371,7 @@ static inline void blk_throtl_bio_endio(struct bio *bio) { }
|
||||
static inline void blk_throtl_stat_add(struct request *rq, u64 time) { }
|
||||
#endif
|
||||
|
||||
void __blk_queue_bounce(struct request_queue *q, struct bio **bio);
|
||||
struct bio *__blk_queue_bounce(struct bio *bio, struct request_queue *q);
|
||||
|
||||
static inline bool blk_queue_may_bounce(struct request_queue *q)
|
||||
{
|
||||
@@ -387,10 +380,12 @@ static inline bool blk_queue_may_bounce(struct request_queue *q)
|
||||
max_low_pfn >= max_pfn;
|
||||
}
|
||||
|
||||
static inline void blk_queue_bounce(struct request_queue *q, struct bio **bio)
|
||||
static inline struct bio *blk_queue_bounce(struct bio *bio,
|
||||
struct request_queue *q)
|
||||
{
|
||||
if (unlikely(blk_queue_may_bounce(q) && bio_has_data(*bio)))
|
||||
__blk_queue_bounce(q, bio);
|
||||
if (unlikely(blk_queue_may_bounce(q) && bio_has_data(bio)))
|
||||
return __blk_queue_bounce(bio, q);
|
||||
return bio;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_BLK_CGROUP_IOLATENCY
|
||||
|
||||
@@ -199,24 +199,24 @@ err_put:
|
||||
return NULL;
|
||||
}
|
||||
|
||||
void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
|
||||
struct bio *__blk_queue_bounce(struct bio *bio_orig, struct request_queue *q)
|
||||
{
|
||||
struct bio *bio;
|
||||
int rw = bio_data_dir(*bio_orig);
|
||||
int rw = bio_data_dir(bio_orig);
|
||||
struct bio_vec *to, from;
|
||||
struct bvec_iter iter;
|
||||
unsigned i = 0, bytes = 0;
|
||||
bool bounce = false;
|
||||
int sectors;
|
||||
|
||||
bio_for_each_segment(from, *bio_orig, iter) {
|
||||
bio_for_each_segment(from, bio_orig, iter) {
|
||||
if (i++ < BIO_MAX_VECS)
|
||||
bytes += from.bv_len;
|
||||
if (PageHighMem(from.bv_page))
|
||||
bounce = true;
|
||||
}
|
||||
if (!bounce)
|
||||
return;
|
||||
return bio_orig;
|
||||
|
||||
/*
|
||||
* Individual bvecs might not be logical block aligned. Round down
|
||||
@@ -225,13 +225,13 @@ void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
|
||||
*/
|
||||
sectors = ALIGN_DOWN(bytes, queue_logical_block_size(q)) >>
|
||||
SECTOR_SHIFT;
|
||||
if (sectors < bio_sectors(*bio_orig)) {
|
||||
bio = bio_split(*bio_orig, sectors, GFP_NOIO, &bounce_bio_split);
|
||||
bio_chain(bio, *bio_orig);
|
||||
submit_bio_noacct(*bio_orig);
|
||||
*bio_orig = bio;
|
||||
if (sectors < bio_sectors(bio_orig)) {
|
||||
bio = bio_split(bio_orig, sectors, GFP_NOIO, &bounce_bio_split);
|
||||
bio_chain(bio, bio_orig);
|
||||
submit_bio_noacct(bio_orig);
|
||||
bio_orig = bio;
|
||||
}
|
||||
bio = bounce_clone_bio(*bio_orig);
|
||||
bio = bounce_clone_bio(bio_orig);
|
||||
|
||||
/*
|
||||
* Bvec table can't be updated by bio_for_each_segment_all(),
|
||||
@@ -254,7 +254,7 @@ void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
|
||||
to->bv_page = bounce_page;
|
||||
}
|
||||
|
||||
trace_block_bio_bounce(*bio_orig);
|
||||
trace_block_bio_bounce(bio_orig);
|
||||
|
||||
bio->bi_flags |= (1 << BIO_BOUNCED);
|
||||
|
||||
@@ -263,6 +263,6 @@ void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
|
||||
else
|
||||
bio->bi_end_io = bounce_end_io_write;
|
||||
|
||||
bio->bi_private = *bio_orig;
|
||||
*bio_orig = bio;
|
||||
bio->bi_private = bio_orig;
|
||||
return bio;
|
||||
}
|
||||
|
||||
@@ -1151,6 +1151,7 @@ static void disk_release(struct device *dev)
|
||||
blk_mq_exit_queue(disk->queue);
|
||||
|
||||
blkcg_exit_queue(disk->queue);
|
||||
bioset_exit(&disk->bio_split);
|
||||
|
||||
disk_release_events(disk);
|
||||
kfree(disk->random);
|
||||
@@ -1342,9 +1343,12 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
|
||||
if (!disk)
|
||||
goto out_put_queue;
|
||||
|
||||
if (bioset_init(&disk->bio_split, BIO_POOL_SIZE, 0, 0))
|
||||
goto out_free_disk;
|
||||
|
||||
disk->bdi = bdi_alloc(node_id);
|
||||
if (!disk->bdi)
|
||||
goto out_free_disk;
|
||||
goto out_free_bioset;
|
||||
|
||||
/* bdev_alloc() might need the queue, set before the first call */
|
||||
disk->queue = q;
|
||||
@@ -1382,6 +1386,8 @@ out_destroy_part_tbl:
|
||||
iput(disk->part0->bd_inode);
|
||||
out_free_bdi:
|
||||
bdi_put(disk->bdi);
|
||||
out_free_bioset:
|
||||
bioset_exit(&disk->bio_split);
|
||||
out_free_disk:
|
||||
kfree(disk);
|
||||
out_put_queue:
|
||||
|
||||
@@ -104,6 +104,12 @@ int crypto_grab_kpp(struct crypto_kpp_spawn *spawn,
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(crypto_grab_kpp);
|
||||
|
||||
int crypto_has_kpp(const char *alg_name, u32 type, u32 mask)
|
||||
{
|
||||
return crypto_type_has_alg(alg_name, &crypto_kpp_type, type, mask);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(crypto_has_kpp);
|
||||
|
||||
static void kpp_prepare_alg(struct kpp_alg *alg)
|
||||
{
|
||||
struct crypto_alg *base = &alg->base;
|
||||
|
||||
@@ -521,6 +521,12 @@ struct crypto_shash *crypto_alloc_shash(const char *alg_name, u32 type,
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(crypto_alloc_shash);
|
||||
|
||||
int crypto_has_shash(const char *alg_name, u32 type, u32 mask)
|
||||
{
|
||||
return crypto_type_has_alg(alg_name, &crypto_shash_type, type, mask);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(crypto_has_shash);
|
||||
|
||||
static int shash_prepare_alg(struct shash_alg *alg)
|
||||
{
|
||||
struct crypto_alg *base = &alg->base;
|
||||
|
||||
@@ -248,15 +248,6 @@ config BLK_DEV_NBD
|
||||
|
||||
If unsure, say N.
|
||||
|
||||
config BLK_DEV_SX8
|
||||
tristate "Promise SATA SX8 support"
|
||||
depends on PCI
|
||||
help
|
||||
Saying Y or M here will enable support for the
|
||||
Promise SATA SX8 controllers.
|
||||
|
||||
Use devices /dev/sx8/$N and /dev/sx8/$Np$M.
|
||||
|
||||
config BLK_DEV_RAM
|
||||
tristate "RAM block device support"
|
||||
help
|
||||
|
||||
@@ -26,8 +26,6 @@ obj-$(CONFIG_SUNVDC) += sunvdc.o
|
||||
obj-$(CONFIG_BLK_DEV_NBD) += nbd.o
|
||||
obj-$(CONFIG_VIRTIO_BLK) += virtio_blk.o
|
||||
|
||||
obj-$(CONFIG_BLK_DEV_SX8) += sx8.o
|
||||
|
||||
obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o
|
||||
obj-$(CONFIG_XEN_BLKDEV_BACKEND) += xen-blkback/
|
||||
obj-$(CONFIG_BLK_DEV_DRBD) += drbd/
|
||||
|
||||
@@ -974,25 +974,58 @@ static void drbd_bm_endio(struct bio *bio)
|
||||
}
|
||||
}
|
||||
|
||||
/* For the layout, see comment above drbd_md_set_sector_offsets(). */
|
||||
static inline sector_t drbd_md_last_bitmap_sector(struct drbd_backing_dev *bdev)
|
||||
{
|
||||
switch (bdev->md.meta_dev_idx) {
|
||||
case DRBD_MD_INDEX_INTERNAL:
|
||||
case DRBD_MD_INDEX_FLEX_INT:
|
||||
return bdev->md.md_offset + bdev->md.al_offset -1;
|
||||
case DRBD_MD_INDEX_FLEX_EXT:
|
||||
default:
|
||||
return bdev->md.md_offset + bdev->md.md_size_sect -1;
|
||||
}
|
||||
}
|
||||
|
||||
static void bm_page_io_async(struct drbd_bm_aio_ctx *ctx, int page_nr) __must_hold(local)
|
||||
{
|
||||
struct drbd_device *device = ctx->device;
|
||||
enum req_op op = ctx->flags & BM_AIO_READ ? REQ_OP_READ : REQ_OP_WRITE;
|
||||
struct bio *bio = bio_alloc_bioset(device->ldev->md_bdev, 1, op,
|
||||
GFP_NOIO, &drbd_md_io_bio_set);
|
||||
struct drbd_bitmap *b = device->bitmap;
|
||||
struct bio *bio;
|
||||
struct page *page;
|
||||
sector_t last_bm_sect;
|
||||
sector_t first_bm_sect;
|
||||
sector_t on_disk_sector;
|
||||
unsigned int len;
|
||||
|
||||
sector_t on_disk_sector =
|
||||
device->ldev->md.md_offset + device->ldev->md.bm_offset;
|
||||
on_disk_sector += ((sector_t)page_nr) << (PAGE_SHIFT-9);
|
||||
first_bm_sect = device->ldev->md.md_offset + device->ldev->md.bm_offset;
|
||||
on_disk_sector = first_bm_sect + (((sector_t)page_nr) << (PAGE_SHIFT-SECTOR_SHIFT));
|
||||
|
||||
/* this might happen with very small
|
||||
* flexible external meta data device,
|
||||
* or with PAGE_SIZE > 4k */
|
||||
len = min_t(unsigned int, PAGE_SIZE,
|
||||
(drbd_md_last_sector(device->ldev) - on_disk_sector + 1)<<9);
|
||||
last_bm_sect = drbd_md_last_bitmap_sector(device->ldev);
|
||||
if (first_bm_sect <= on_disk_sector && last_bm_sect >= on_disk_sector) {
|
||||
sector_t len_sect = last_bm_sect - on_disk_sector + 1;
|
||||
if (len_sect < PAGE_SIZE/SECTOR_SIZE)
|
||||
len = (unsigned int)len_sect*SECTOR_SIZE;
|
||||
else
|
||||
len = PAGE_SIZE;
|
||||
} else {
|
||||
if (__ratelimit(&drbd_ratelimit_state)) {
|
||||
drbd_err(device, "Invalid offset during on-disk bitmap access: "
|
||||
"page idx %u, sector %llu\n", page_nr, on_disk_sector);
|
||||
}
|
||||
ctx->error = -EIO;
|
||||
bm_set_page_io_err(b->bm_pages[page_nr]);
|
||||
if (atomic_dec_and_test(&ctx->in_flight)) {
|
||||
ctx->done = 1;
|
||||
wake_up(&device->misc_wait);
|
||||
kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
/* serialize IO on this page */
|
||||
bm_page_lock_io(device, page_nr);
|
||||
@@ -1007,6 +1040,8 @@ static void bm_page_io_async(struct drbd_bm_aio_ctx *ctx, int page_nr) __must_ho
|
||||
bm_store_page_idx(page, page_nr);
|
||||
} else
|
||||
page = b->bm_pages[page_nr];
|
||||
bio = bio_alloc_bioset(device->ldev->md_bdev, 1, op, GFP_NOIO,
|
||||
&drbd_md_io_bio_set);
|
||||
bio->bi_iter.bi_sector = on_disk_sector;
|
||||
/* bio_add_page of a single page to an empty bio will always succeed,
|
||||
* according to api. Do we want to assert that? */
|
||||
|
||||
@@ -1608,7 +1608,7 @@ void drbd_submit_bio(struct bio *bio)
|
||||
{
|
||||
struct drbd_device *device = bio->bi_bdev->bd_disk->private_data;
|
||||
|
||||
blk_queue_split(&bio);
|
||||
bio = bio_split_to_limits(bio);
|
||||
|
||||
/*
|
||||
* what we "blindly" assume:
|
||||
|
||||
@@ -11,6 +11,8 @@
|
||||
* (part of code stolen from loop.c)
|
||||
*/
|
||||
|
||||
#define pr_fmt(fmt) "nbd: " fmt
|
||||
|
||||
#include <linux/major.h>
|
||||
|
||||
#include <linux/blkdev.h>
|
||||
@@ -1950,7 +1952,7 @@ again:
|
||||
test_bit(NBD_DISCONNECT_REQUESTED, &nbd->flags)) ||
|
||||
!refcount_inc_not_zero(&nbd->refs)) {
|
||||
mutex_unlock(&nbd_index_mutex);
|
||||
pr_err("nbd: device at index %d is going down\n",
|
||||
pr_err("device at index %d is going down\n",
|
||||
index);
|
||||
return -EINVAL;
|
||||
}
|
||||
@@ -1961,7 +1963,7 @@ again:
|
||||
if (!nbd) {
|
||||
nbd = nbd_dev_add(index, 2);
|
||||
if (IS_ERR(nbd)) {
|
||||
pr_err("nbd: failed to add new device\n");
|
||||
pr_err("failed to add new device\n");
|
||||
return PTR_ERR(nbd);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -201,6 +201,22 @@ static bool g_use_per_node_hctx;
|
||||
module_param_named(use_per_node_hctx, g_use_per_node_hctx, bool, 0444);
|
||||
MODULE_PARM_DESC(use_per_node_hctx, "Use per-node allocation for hardware context queues. Default: false");
|
||||
|
||||
static bool g_memory_backed;
|
||||
module_param_named(memory_backed, g_memory_backed, bool, 0444);
|
||||
MODULE_PARM_DESC(memory_backed, "Create a memory-backed block device. Default: false");
|
||||
|
||||
static bool g_discard;
|
||||
module_param_named(discard, g_discard, bool, 0444);
|
||||
MODULE_PARM_DESC(discard, "Support discard operations (requires memory-backed null_blk device). Default: false");
|
||||
|
||||
static unsigned long g_cache_size;
|
||||
module_param_named(cache_size, g_cache_size, ulong, 0444);
|
||||
MODULE_PARM_DESC(cache_size, "Cache size in MiB for memory-backed device. Default: 0 (none)");
|
||||
|
||||
static unsigned int g_mbps;
|
||||
module_param_named(mbps, g_mbps, uint, 0444);
|
||||
MODULE_PARM_DESC(mbps, "Limit maximum bandwidth (in MiB/s). Default: 0 (no limit)");
|
||||
|
||||
static bool g_zoned;
|
||||
module_param_named(zoned, g_zoned, bool, S_IRUGO);
|
||||
MODULE_PARM_DESC(zoned, "Make device as a host-managed zoned block device. Default: false");
|
||||
@@ -409,6 +425,8 @@ NULLB_DEVICE_ATTR(zone_nr_conv, uint, NULL);
|
||||
NULLB_DEVICE_ATTR(zone_max_open, uint, NULL);
|
||||
NULLB_DEVICE_ATTR(zone_max_active, uint, NULL);
|
||||
NULLB_DEVICE_ATTR(virt_boundary, bool, NULL);
|
||||
NULLB_DEVICE_ATTR(no_sched, bool, NULL);
|
||||
NULLB_DEVICE_ATTR(shared_tag_bitmap, bool, NULL);
|
||||
|
||||
static ssize_t nullb_device_power_show(struct config_item *item, char *page)
|
||||
{
|
||||
@@ -532,6 +550,8 @@ static struct configfs_attribute *nullb_device_attrs[] = {
|
||||
&nullb_device_attr_zone_max_open,
|
||||
&nullb_device_attr_zone_max_active,
|
||||
&nullb_device_attr_virt_boundary,
|
||||
&nullb_device_attr_no_sched,
|
||||
&nullb_device_attr_shared_tag_bitmap,
|
||||
NULL,
|
||||
};
|
||||
|
||||
@@ -588,7 +608,13 @@ nullb_group_drop_item(struct config_group *group, struct config_item *item)
|
||||
static ssize_t memb_group_features_show(struct config_item *item, char *page)
|
||||
{
|
||||
return snprintf(page, PAGE_SIZE,
|
||||
"memory_backed,discard,bandwidth,cache,badblocks,zoned,zone_size,zone_capacity,zone_nr_conv,zone_max_open,zone_max_active,blocksize,max_sectors,virt_boundary\n");
|
||||
"badblocks,blocking,blocksize,cache_size,"
|
||||
"completion_nsec,discard,home_node,hw_queue_depth,"
|
||||
"irqmode,max_sectors,mbps,memory_backed,no_sched,"
|
||||
"poll_queues,power,queue_mode,shared_tag_bitmap,size,"
|
||||
"submit_queues,use_per_node_hctx,virt_boundary,zoned,"
|
||||
"zone_capacity,zone_max_active,zone_max_open,"
|
||||
"zone_nr_conv,zone_size\n");
|
||||
}
|
||||
|
||||
CONFIGFS_ATTR_RO(memb_group_, features);
|
||||
@@ -650,6 +676,10 @@ static struct nullb_device *null_alloc_dev(void)
|
||||
dev->irqmode = g_irqmode;
|
||||
dev->hw_queue_depth = g_hw_queue_depth;
|
||||
dev->blocking = g_blocking;
|
||||
dev->memory_backed = g_memory_backed;
|
||||
dev->discard = g_discard;
|
||||
dev->cache_size = g_cache_size;
|
||||
dev->mbps = g_mbps;
|
||||
dev->use_per_node_hctx = g_use_per_node_hctx;
|
||||
dev->zoned = g_zoned;
|
||||
dev->zone_size = g_zone_size;
|
||||
@@ -658,6 +688,8 @@ static struct nullb_device *null_alloc_dev(void)
|
||||
dev->zone_max_open = g_zone_max_open;
|
||||
dev->zone_max_active = g_zone_max_active;
|
||||
dev->virt_boundary = g_virt_boundary;
|
||||
dev->no_sched = g_no_sched;
|
||||
dev->shared_tag_bitmap = g_shared_tag_bitmap;
|
||||
return dev;
|
||||
}
|
||||
|
||||
@@ -1655,7 +1687,7 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx,
|
||||
|
||||
static void cleanup_queue(struct nullb_queue *nq)
|
||||
{
|
||||
kfree(nq->tag_map);
|
||||
bitmap_free(nq->tag_map);
|
||||
kfree(nq->cmds);
|
||||
}
|
||||
|
||||
@@ -1782,14 +1814,13 @@ static const struct block_device_operations null_rq_ops = {
|
||||
static int setup_commands(struct nullb_queue *nq)
|
||||
{
|
||||
struct nullb_cmd *cmd;
|
||||
int i, tag_size;
|
||||
int i;
|
||||
|
||||
nq->cmds = kcalloc(nq->queue_depth, sizeof(*cmd), GFP_KERNEL);
|
||||
if (!nq->cmds)
|
||||
return -ENOMEM;
|
||||
|
||||
tag_size = ALIGN(nq->queue_depth, BITS_PER_LONG) / BITS_PER_LONG;
|
||||
nq->tag_map = kcalloc(tag_size, sizeof(unsigned long), GFP_KERNEL);
|
||||
nq->tag_map = bitmap_zalloc(nq->queue_depth, GFP_KERNEL);
|
||||
if (!nq->tag_map) {
|
||||
kfree(nq->cmds);
|
||||
return -ENOMEM;
|
||||
@@ -1866,31 +1897,48 @@ static int null_gendisk_register(struct nullb *nullb)
|
||||
|
||||
static int null_init_tag_set(struct nullb *nullb, struct blk_mq_tag_set *set)
|
||||
{
|
||||
unsigned int flags = BLK_MQ_F_SHOULD_MERGE;
|
||||
int hw_queues, numa_node;
|
||||
unsigned int queue_depth;
|
||||
int poll_queues;
|
||||
|
||||
set->ops = &null_mq_ops;
|
||||
set->nr_hw_queues = nullb ? nullb->dev->submit_queues :
|
||||
g_submit_queues;
|
||||
poll_queues = nullb ? nullb->dev->poll_queues : g_poll_queues;
|
||||
if (poll_queues)
|
||||
set->nr_hw_queues += poll_queues;
|
||||
set->queue_depth = nullb ? nullb->dev->hw_queue_depth :
|
||||
g_hw_queue_depth;
|
||||
set->numa_node = nullb ? nullb->dev->home_node : g_home_node;
|
||||
set->cmd_size = sizeof(struct nullb_cmd);
|
||||
set->flags = BLK_MQ_F_SHOULD_MERGE;
|
||||
if (g_no_sched)
|
||||
set->flags |= BLK_MQ_F_NO_SCHED;
|
||||
if (g_shared_tag_bitmap)
|
||||
set->flags |= BLK_MQ_F_TAG_HCTX_SHARED;
|
||||
set->driver_data = nullb;
|
||||
if (poll_queues)
|
||||
set->nr_maps = 3;
|
||||
else
|
||||
set->nr_maps = 1;
|
||||
if (nullb) {
|
||||
hw_queues = nullb->dev->submit_queues;
|
||||
poll_queues = nullb->dev->poll_queues;
|
||||
queue_depth = nullb->dev->hw_queue_depth;
|
||||
numa_node = nullb->dev->home_node;
|
||||
if (nullb->dev->no_sched)
|
||||
flags |= BLK_MQ_F_NO_SCHED;
|
||||
if (nullb->dev->shared_tag_bitmap)
|
||||
flags |= BLK_MQ_F_TAG_HCTX_SHARED;
|
||||
if (nullb->dev->blocking)
|
||||
flags |= BLK_MQ_F_BLOCKING;
|
||||
} else {
|
||||
hw_queues = g_submit_queues;
|
||||
poll_queues = g_poll_queues;
|
||||
queue_depth = g_hw_queue_depth;
|
||||
numa_node = g_home_node;
|
||||
if (g_no_sched)
|
||||
flags |= BLK_MQ_F_NO_SCHED;
|
||||
if (g_shared_tag_bitmap)
|
||||
flags |= BLK_MQ_F_TAG_HCTX_SHARED;
|
||||
if (g_blocking)
|
||||
flags |= BLK_MQ_F_BLOCKING;
|
||||
}
|
||||
|
||||
if ((nullb && nullb->dev->blocking) || g_blocking)
|
||||
set->flags |= BLK_MQ_F_BLOCKING;
|
||||
set->ops = &null_mq_ops;
|
||||
set->cmd_size = sizeof(struct nullb_cmd);
|
||||
set->flags = flags;
|
||||
set->driver_data = nullb;
|
||||
set->nr_hw_queues = hw_queues;
|
||||
set->queue_depth = queue_depth;
|
||||
set->numa_node = numa_node;
|
||||
if (poll_queues) {
|
||||
set->nr_hw_queues += poll_queues;
|
||||
set->nr_maps = 3;
|
||||
} else {
|
||||
set->nr_maps = 1;
|
||||
}
|
||||
|
||||
return blk_mq_alloc_tag_set(set);
|
||||
}
|
||||
@@ -2042,8 +2090,13 @@ static int null_add_dev(struct nullb_device *dev)
|
||||
blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, nullb->q);
|
||||
|
||||
mutex_lock(&lock);
|
||||
nullb->index = ida_simple_get(&nullb_indexes, 0, 0, GFP_KERNEL);
|
||||
dev->index = nullb->index;
|
||||
rv = ida_simple_get(&nullb_indexes, 0, 0, GFP_KERNEL);
|
||||
if (rv < 0) {
|
||||
mutex_unlock(&lock);
|
||||
goto out_cleanup_zone;
|
||||
}
|
||||
nullb->index = rv;
|
||||
dev->index = rv;
|
||||
mutex_unlock(&lock);
|
||||
|
||||
blk_queue_logical_block_size(nullb->q, dev->blocksize);
|
||||
@@ -2069,7 +2122,7 @@ static int null_add_dev(struct nullb_device *dev)
|
||||
|
||||
rv = null_gendisk_register(nullb);
|
||||
if (rv)
|
||||
goto out_cleanup_zone;
|
||||
goto out_ida_free;
|
||||
|
||||
mutex_lock(&lock);
|
||||
list_add_tail(&nullb->list, &nullb_list);
|
||||
@@ -2078,6 +2131,9 @@ static int null_add_dev(struct nullb_device *dev)
|
||||
pr_info("disk %s created\n", nullb->disk_name);
|
||||
|
||||
return 0;
|
||||
|
||||
out_ida_free:
|
||||
ida_free(&nullb_indexes, nullb->index);
|
||||
out_cleanup_zone:
|
||||
null_free_zoned_dev(dev);
|
||||
out_cleanup_disk:
|
||||
|
||||
@@ -113,6 +113,8 @@ struct nullb_device {
|
||||
bool discard; /* if support discard */
|
||||
bool zoned; /* if device is zoned */
|
||||
bool virt_boundary; /* virtual boundary on/off for the device */
|
||||
bool no_sched; /* no IO scheduler for the device */
|
||||
bool shared_tag_bitmap; /* use hostwide shared tags */
|
||||
};
|
||||
|
||||
struct nullb {
|
||||
|
||||
@@ -2399,7 +2399,7 @@ static void pkt_submit_bio(struct bio *bio)
|
||||
struct pktcdvd_device *pd = bio->bi_bdev->bd_disk->queue->queuedata;
|
||||
struct bio *split;
|
||||
|
||||
blk_queue_split(&bio);
|
||||
bio = bio_split_to_limits(bio);
|
||||
|
||||
pkt_dbg(2, pd, "start = %6llx stop = %6llx\n",
|
||||
(unsigned long long)bio->bi_iter.bi_sector,
|
||||
|
||||
@@ -586,7 +586,7 @@ static void ps3vram_submit_bio(struct bio *bio)
|
||||
|
||||
dev_dbg(&dev->core, "%s\n", __func__);
|
||||
|
||||
blk_queue_split(&bio);
|
||||
bio = bio_split_to_limits(bio);
|
||||
|
||||
spin_lock_irq(&priv->lock);
|
||||
busy = !bio_list_empty(&priv->list);
|
||||
|
||||
@@ -376,7 +376,7 @@ static ssize_t rnbd_clt_resize_dev_store(struct kobject *kobj,
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
ret = rnbd_clt_resize_disk(dev, (size_t)sectors);
|
||||
ret = rnbd_clt_resize_disk(dev, sectors);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
|
||||
@@ -68,39 +68,18 @@ static inline bool rnbd_clt_get_dev(struct rnbd_clt_dev *dev)
|
||||
return refcount_inc_not_zero(&dev->refcount);
|
||||
}
|
||||
|
||||
static int rnbd_clt_set_dev_attr(struct rnbd_clt_dev *dev,
|
||||
const struct rnbd_msg_open_rsp *rsp)
|
||||
static void rnbd_clt_change_capacity(struct rnbd_clt_dev *dev,
|
||||
sector_t new_nsectors)
|
||||
{
|
||||
struct rnbd_clt_session *sess = dev->sess;
|
||||
if (get_capacity(dev->gd) == new_nsectors)
|
||||
return;
|
||||
|
||||
if (!rsp->logical_block_size)
|
||||
return -EINVAL;
|
||||
|
||||
dev->device_id = le32_to_cpu(rsp->device_id);
|
||||
dev->nsectors = le64_to_cpu(rsp->nsectors);
|
||||
dev->logical_block_size = le16_to_cpu(rsp->logical_block_size);
|
||||
dev->physical_block_size = le16_to_cpu(rsp->physical_block_size);
|
||||
dev->max_discard_sectors = le32_to_cpu(rsp->max_discard_sectors);
|
||||
dev->discard_granularity = le32_to_cpu(rsp->discard_granularity);
|
||||
dev->discard_alignment = le32_to_cpu(rsp->discard_alignment);
|
||||
dev->secure_discard = le16_to_cpu(rsp->secure_discard);
|
||||
dev->wc = !!(rsp->cache_policy & RNBD_WRITEBACK);
|
||||
dev->fua = !!(rsp->cache_policy & RNBD_FUA);
|
||||
|
||||
dev->max_hw_sectors = sess->max_io_size / SECTOR_SIZE;
|
||||
dev->max_segments = sess->max_segments;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int rnbd_clt_change_capacity(struct rnbd_clt_dev *dev,
|
||||
size_t new_nsectors)
|
||||
{
|
||||
rnbd_clt_info(dev, "Device size changed from %zu to %zu sectors\n",
|
||||
dev->nsectors, new_nsectors);
|
||||
dev->nsectors = new_nsectors;
|
||||
set_capacity_and_notify(dev->gd, dev->nsectors);
|
||||
return 0;
|
||||
/*
|
||||
* If the size changed, we need to revalidate it
|
||||
*/
|
||||
rnbd_clt_info(dev, "Device size changed from %llu to %llu sectors\n",
|
||||
get_capacity(dev->gd), new_nsectors);
|
||||
set_capacity_and_notify(dev->gd, new_nsectors);
|
||||
}
|
||||
|
||||
static int process_msg_open_rsp(struct rnbd_clt_dev *dev,
|
||||
@@ -119,19 +98,16 @@ static int process_msg_open_rsp(struct rnbd_clt_dev *dev,
|
||||
if (dev->dev_state == DEV_STATE_MAPPED_DISCONNECTED) {
|
||||
u64 nsectors = le64_to_cpu(rsp->nsectors);
|
||||
|
||||
/*
|
||||
* If the device was remapped and the size changed in the
|
||||
* meantime we need to revalidate it
|
||||
*/
|
||||
if (dev->nsectors != nsectors)
|
||||
rnbd_clt_change_capacity(dev, nsectors);
|
||||
rnbd_clt_change_capacity(dev, nsectors);
|
||||
gd_kobj = &disk_to_dev(dev->gd)->kobj;
|
||||
kobject_uevent(gd_kobj, KOBJ_ONLINE);
|
||||
rnbd_clt_info(dev, "Device online, device remapped successfully\n");
|
||||
}
|
||||
err = rnbd_clt_set_dev_attr(dev, rsp);
|
||||
if (err)
|
||||
if (!rsp->logical_block_size) {
|
||||
err = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
dev->device_id = le32_to_cpu(rsp->device_id);
|
||||
dev->dev_state = DEV_STATE_MAPPED;
|
||||
|
||||
out:
|
||||
@@ -140,7 +116,7 @@ out:
|
||||
return err;
|
||||
}
|
||||
|
||||
int rnbd_clt_resize_disk(struct rnbd_clt_dev *dev, size_t newsize)
|
||||
int rnbd_clt_resize_disk(struct rnbd_clt_dev *dev, sector_t newsize)
|
||||
{
|
||||
int ret = 0;
|
||||
|
||||
@@ -150,7 +126,7 @@ int rnbd_clt_resize_disk(struct rnbd_clt_dev *dev, size_t newsize)
|
||||
ret = -ENOENT;
|
||||
goto out;
|
||||
}
|
||||
ret = rnbd_clt_change_capacity(dev, newsize);
|
||||
rnbd_clt_change_capacity(dev, newsize);
|
||||
|
||||
out:
|
||||
mutex_unlock(&dev->lock);
|
||||
@@ -507,6 +483,11 @@ static void msg_open_conf(struct work_struct *work)
|
||||
struct rnbd_msg_open_rsp *rsp = iu->buf;
|
||||
struct rnbd_clt_dev *dev = iu->dev;
|
||||
int errno = iu->errno;
|
||||
bool from_map = false;
|
||||
|
||||
/* INIT state is only triggered from rnbd_clt_map_device */
|
||||
if (dev->dev_state == DEV_STATE_INIT)
|
||||
from_map = true;
|
||||
|
||||
if (errno) {
|
||||
rnbd_clt_err(dev,
|
||||
@@ -523,7 +504,9 @@ static void msg_open_conf(struct work_struct *work)
|
||||
send_msg_close(dev, device_id, RTRS_PERMIT_NOWAIT);
|
||||
}
|
||||
}
|
||||
kfree(rsp);
|
||||
/* We free rsp in rnbd_clt_map_device for map scenario */
|
||||
if (!from_map)
|
||||
kfree(rsp);
|
||||
wake_up_iu_comp(iu, errno);
|
||||
rnbd_put_iu(dev->sess, iu);
|
||||
rnbd_clt_put_dev(dev);
|
||||
@@ -942,7 +925,7 @@ static int rnbd_client_open(struct block_device *block_device, fmode_t mode)
|
||||
{
|
||||
struct rnbd_clt_dev *dev = block_device->bd_disk->private_data;
|
||||
|
||||
if (dev->read_only && (mode & FMODE_WRITE))
|
||||
if (get_disk_ro(dev->gd) && (mode & FMODE_WRITE))
|
||||
return -EPERM;
|
||||
|
||||
if (dev->dev_state == DEV_STATE_UNMAPPED ||
|
||||
@@ -963,10 +946,10 @@ static int rnbd_client_getgeo(struct block_device *block_device,
|
||||
struct hd_geometry *geo)
|
||||
{
|
||||
u64 size;
|
||||
struct rnbd_clt_dev *dev;
|
||||
struct rnbd_clt_dev *dev = block_device->bd_disk->private_data;
|
||||
struct queue_limits *limit = &dev->queue->limits;
|
||||
|
||||
dev = block_device->bd_disk->private_data;
|
||||
size = dev->size * (dev->logical_block_size / SECTOR_SIZE);
|
||||
size = dev->size * (limit->logical_block_size / SECTOR_SIZE);
|
||||
geo->cylinders = size >> 6; /* size/64 */
|
||||
geo->heads = 4;
|
||||
geo->sectors = 16;
|
||||
@@ -1350,11 +1333,15 @@ static void rnbd_init_mq_hw_queues(struct rnbd_clt_dev *dev)
|
||||
}
|
||||
}
|
||||
|
||||
static void setup_request_queue(struct rnbd_clt_dev *dev)
|
||||
static void setup_request_queue(struct rnbd_clt_dev *dev,
|
||||
struct rnbd_msg_open_rsp *rsp)
|
||||
{
|
||||
blk_queue_logical_block_size(dev->queue, dev->logical_block_size);
|
||||
blk_queue_physical_block_size(dev->queue, dev->physical_block_size);
|
||||
blk_queue_max_hw_sectors(dev->queue, dev->max_hw_sectors);
|
||||
blk_queue_logical_block_size(dev->queue,
|
||||
le16_to_cpu(rsp->logical_block_size));
|
||||
blk_queue_physical_block_size(dev->queue,
|
||||
le16_to_cpu(rsp->physical_block_size));
|
||||
blk_queue_max_hw_sectors(dev->queue,
|
||||
dev->sess->max_io_size / SECTOR_SIZE);
|
||||
|
||||
/*
|
||||
* we don't support discards to "discontiguous" segments
|
||||
@@ -1362,21 +1349,27 @@ static void setup_request_queue(struct rnbd_clt_dev *dev)
|
||||
*/
|
||||
blk_queue_max_discard_segments(dev->queue, 1);
|
||||
|
||||
blk_queue_max_discard_sectors(dev->queue, dev->max_discard_sectors);
|
||||
dev->queue->limits.discard_granularity = dev->discard_granularity;
|
||||
dev->queue->limits.discard_alignment = dev->discard_alignment;
|
||||
if (dev->secure_discard)
|
||||
blk_queue_max_discard_sectors(dev->queue,
|
||||
le32_to_cpu(rsp->max_discard_sectors));
|
||||
dev->queue->limits.discard_granularity =
|
||||
le32_to_cpu(rsp->discard_granularity);
|
||||
dev->queue->limits.discard_alignment =
|
||||
le32_to_cpu(rsp->discard_alignment);
|
||||
if (le16_to_cpu(rsp->secure_discard))
|
||||
blk_queue_max_secure_erase_sectors(dev->queue,
|
||||
dev->max_discard_sectors);
|
||||
le32_to_cpu(rsp->max_discard_sectors));
|
||||
blk_queue_flag_set(QUEUE_FLAG_SAME_COMP, dev->queue);
|
||||
blk_queue_flag_set(QUEUE_FLAG_SAME_FORCE, dev->queue);
|
||||
blk_queue_max_segments(dev->queue, dev->max_segments);
|
||||
blk_queue_max_segments(dev->queue, dev->sess->max_segments);
|
||||
blk_queue_io_opt(dev->queue, dev->sess->max_io_size);
|
||||
blk_queue_virt_boundary(dev->queue, SZ_4K - 1);
|
||||
blk_queue_write_cache(dev->queue, dev->wc, dev->fua);
|
||||
blk_queue_write_cache(dev->queue,
|
||||
!!(rsp->cache_policy & RNBD_WRITEBACK),
|
||||
!!(rsp->cache_policy & RNBD_FUA));
|
||||
}
|
||||
|
||||
static int rnbd_clt_setup_gen_disk(struct rnbd_clt_dev *dev, int idx)
|
||||
static int rnbd_clt_setup_gen_disk(struct rnbd_clt_dev *dev,
|
||||
struct rnbd_msg_open_rsp *rsp, int idx)
|
||||
{
|
||||
int err;
|
||||
|
||||
@@ -1388,19 +1381,15 @@ static int rnbd_clt_setup_gen_disk(struct rnbd_clt_dev *dev, int idx)
|
||||
dev->gd->private_data = dev;
|
||||
snprintf(dev->gd->disk_name, sizeof(dev->gd->disk_name), "rnbd%d",
|
||||
idx);
|
||||
pr_debug("disk_name=%s, capacity=%zu\n",
|
||||
pr_debug("disk_name=%s, capacity=%llu\n",
|
||||
dev->gd->disk_name,
|
||||
dev->nsectors * (dev->logical_block_size / SECTOR_SIZE)
|
||||
);
|
||||
le64_to_cpu(rsp->nsectors) *
|
||||
(le16_to_cpu(rsp->logical_block_size) / SECTOR_SIZE));
|
||||
|
||||
set_capacity(dev->gd, dev->nsectors);
|
||||
set_capacity(dev->gd, le64_to_cpu(rsp->nsectors));
|
||||
|
||||
if (dev->access_mode == RNBD_ACCESS_RO) {
|
||||
dev->read_only = true;
|
||||
if (dev->access_mode == RNBD_ACCESS_RO)
|
||||
set_disk_ro(dev->gd, true);
|
||||
} else {
|
||||
dev->read_only = false;
|
||||
}
|
||||
|
||||
/*
|
||||
* Network device does not need rotational
|
||||
@@ -1413,11 +1402,13 @@ static int rnbd_clt_setup_gen_disk(struct rnbd_clt_dev *dev, int idx)
|
||||
return err;
|
||||
}
|
||||
|
||||
static int rnbd_client_setup_device(struct rnbd_clt_dev *dev)
|
||||
static int rnbd_client_setup_device(struct rnbd_clt_dev *dev,
|
||||
struct rnbd_msg_open_rsp *rsp)
|
||||
{
|
||||
int idx = dev->clt_device_id;
|
||||
|
||||
dev->size = dev->nsectors * dev->logical_block_size;
|
||||
dev->size = le64_to_cpu(rsp->nsectors) *
|
||||
le16_to_cpu(rsp->logical_block_size);
|
||||
|
||||
dev->gd = blk_mq_alloc_disk(&dev->sess->tag_set, dev);
|
||||
if (IS_ERR(dev->gd))
|
||||
@@ -1425,8 +1416,8 @@ static int rnbd_client_setup_device(struct rnbd_clt_dev *dev)
|
||||
dev->queue = dev->gd->queue;
|
||||
rnbd_init_mq_hw_queues(dev);
|
||||
|
||||
setup_request_queue(dev);
|
||||
return rnbd_clt_setup_gen_disk(dev, idx);
|
||||
setup_request_queue(dev, rsp);
|
||||
return rnbd_clt_setup_gen_disk(dev, rsp, idx);
|
||||
}
|
||||
|
||||
static struct rnbd_clt_dev *init_dev(struct rnbd_clt_session *sess,
|
||||
@@ -1562,7 +1553,14 @@ struct rnbd_clt_dev *rnbd_clt_map_device(const char *sessname,
|
||||
{
|
||||
struct rnbd_clt_session *sess;
|
||||
struct rnbd_clt_dev *dev;
|
||||
int ret;
|
||||
int ret, errno;
|
||||
struct rnbd_msg_open_rsp *rsp;
|
||||
struct rnbd_msg_open msg;
|
||||
struct rnbd_iu *iu;
|
||||
struct kvec vec = {
|
||||
.iov_base = &msg,
|
||||
.iov_len = sizeof(msg)
|
||||
};
|
||||
|
||||
if (exists_devpath(pathname, sessname))
|
||||
return ERR_PTR(-EEXIST);
|
||||
@@ -1582,17 +1580,47 @@ struct rnbd_clt_dev *rnbd_clt_map_device(const char *sessname,
|
||||
ret = -EEXIST;
|
||||
goto put_dev;
|
||||
}
|
||||
ret = send_msg_open(dev, RTRS_PERMIT_WAIT);
|
||||
|
||||
rsp = kzalloc(sizeof(*rsp), GFP_KERNEL);
|
||||
if (!rsp) {
|
||||
ret = -ENOMEM;
|
||||
goto del_dev;
|
||||
}
|
||||
|
||||
iu = rnbd_get_iu(sess, RTRS_ADMIN_CON, RTRS_PERMIT_WAIT);
|
||||
if (!iu) {
|
||||
ret = -ENOMEM;
|
||||
kfree(rsp);
|
||||
goto del_dev;
|
||||
}
|
||||
iu->buf = rsp;
|
||||
iu->dev = dev;
|
||||
sg_init_one(iu->sgt.sgl, rsp, sizeof(*rsp));
|
||||
|
||||
msg.hdr.type = cpu_to_le16(RNBD_MSG_OPEN);
|
||||
msg.access_mode = dev->access_mode;
|
||||
strscpy(msg.dev_name, dev->pathname, sizeof(msg.dev_name));
|
||||
|
||||
WARN_ON(!rnbd_clt_get_dev(dev));
|
||||
ret = send_usr_msg(sess->rtrs, READ, iu,
|
||||
&vec, sizeof(*rsp), iu->sgt.sgl, 1,
|
||||
msg_open_conf, &errno, RTRS_PERMIT_WAIT);
|
||||
if (ret) {
|
||||
rnbd_clt_put_dev(dev);
|
||||
rnbd_put_iu(sess, iu);
|
||||
} else {
|
||||
ret = errno;
|
||||
}
|
||||
if (ret) {
|
||||
rnbd_clt_err(dev,
|
||||
"map_device: failed, can't open remote device, err: %d\n",
|
||||
ret);
|
||||
goto del_dev;
|
||||
goto put_iu;
|
||||
}
|
||||
mutex_lock(&dev->lock);
|
||||
pr_debug("Opened remote device: session=%s, path='%s'\n",
|
||||
sess->sessname, pathname);
|
||||
ret = rnbd_client_setup_device(dev);
|
||||
ret = rnbd_client_setup_device(dev, rsp);
|
||||
if (ret) {
|
||||
rnbd_clt_err(dev,
|
||||
"map_device: Failed to configure device, err: %d\n",
|
||||
@@ -1602,21 +1630,30 @@ struct rnbd_clt_dev *rnbd_clt_map_device(const char *sessname,
|
||||
}
|
||||
|
||||
rnbd_clt_info(dev,
|
||||
"map_device: Device mapped as %s (nsectors: %zu, logical_block_size: %d, physical_block_size: %d, max_discard_sectors: %d, discard_granularity: %d, discard_alignment: %d, secure_discard: %d, max_segments: %d, max_hw_sectors: %d, wc: %d, fua: %d)\n",
|
||||
dev->gd->disk_name, dev->nsectors,
|
||||
dev->logical_block_size, dev->physical_block_size,
|
||||
dev->max_discard_sectors,
|
||||
dev->discard_granularity, dev->discard_alignment,
|
||||
dev->secure_discard, dev->max_segments,
|
||||
dev->max_hw_sectors, dev->wc, dev->fua);
|
||||
"map_device: Device mapped as %s (nsectors: %llu, logical_block_size: %d, physical_block_size: %d, max_discard_sectors: %d, discard_granularity: %d, discard_alignment: %d, secure_discard: %d, max_segments: %d, max_hw_sectors: %d, wc: %d, fua: %d)\n",
|
||||
dev->gd->disk_name, le64_to_cpu(rsp->nsectors),
|
||||
le16_to_cpu(rsp->logical_block_size),
|
||||
le16_to_cpu(rsp->physical_block_size),
|
||||
le32_to_cpu(rsp->max_discard_sectors),
|
||||
le32_to_cpu(rsp->discard_granularity),
|
||||
le32_to_cpu(rsp->discard_alignment),
|
||||
le16_to_cpu(rsp->secure_discard),
|
||||
sess->max_segments, sess->max_io_size / SECTOR_SIZE,
|
||||
!!(rsp->cache_policy & RNBD_WRITEBACK),
|
||||
!!(rsp->cache_policy & RNBD_FUA));
|
||||
|
||||
mutex_unlock(&dev->lock);
|
||||
kfree(rsp);
|
||||
rnbd_put_iu(sess, iu);
|
||||
rnbd_clt_put_sess(sess);
|
||||
|
||||
return dev;
|
||||
|
||||
send_close:
|
||||
send_msg_close(dev, dev->device_id, RTRS_PERMIT_WAIT);
|
||||
put_iu:
|
||||
kfree(rsp);
|
||||
rnbd_put_iu(sess, iu);
|
||||
del_dev:
|
||||
delete_dev(dev);
|
||||
put_dev:
|
||||
|
||||
@@ -106,6 +106,7 @@ struct rnbd_queue {
|
||||
};
|
||||
|
||||
struct rnbd_clt_dev {
|
||||
struct kobject kobj;
|
||||
struct rnbd_clt_session *sess;
|
||||
struct request_queue *queue;
|
||||
struct rnbd_queue *hw_queues;
|
||||
@@ -114,27 +115,14 @@ struct rnbd_clt_dev {
|
||||
u32 clt_device_id;
|
||||
struct mutex lock;
|
||||
enum rnbd_clt_dev_state dev_state;
|
||||
refcount_t refcount;
|
||||
char *pathname;
|
||||
enum rnbd_access_mode access_mode;
|
||||
u32 nr_poll_queues;
|
||||
bool read_only;
|
||||
bool wc;
|
||||
bool fua;
|
||||
u32 max_hw_sectors;
|
||||
u32 max_discard_sectors;
|
||||
u32 discard_granularity;
|
||||
u32 discard_alignment;
|
||||
u16 secure_discard;
|
||||
u16 physical_block_size;
|
||||
u16 logical_block_size;
|
||||
u16 max_segments;
|
||||
size_t nsectors;
|
||||
u64 size; /* device size in bytes */
|
||||
struct list_head list;
|
||||
struct gendisk *gd;
|
||||
struct kobject kobj;
|
||||
char *blk_symlink_name;
|
||||
refcount_t refcount;
|
||||
struct work_struct unmap_on_rmmod_work;
|
||||
};
|
||||
|
||||
@@ -150,7 +138,7 @@ int rnbd_clt_unmap_device(struct rnbd_clt_dev *dev, bool force,
|
||||
const struct attribute *sysfs_self);
|
||||
|
||||
int rnbd_clt_remap_device(struct rnbd_clt_dev *dev);
|
||||
int rnbd_clt_resize_disk(struct rnbd_clt_dev *dev, size_t newsize);
|
||||
int rnbd_clt_resize_disk(struct rnbd_clt_dev *dev, sector_t newsize);
|
||||
|
||||
/* rnbd-clt-sysfs.c */
|
||||
|
||||
|
||||
@@ -224,7 +224,6 @@ void rnbd_destroy_sess_dev(struct rnbd_srv_sess_dev *sess_dev, bool keep_id)
|
||||
wait_for_completion(&dc); /* wait for inflights to drop to zero */
|
||||
|
||||
rnbd_dev_close(sess_dev->rnbd_dev);
|
||||
list_del(&sess_dev->sess_list);
|
||||
mutex_lock(&sess_dev->dev->lock);
|
||||
list_del(&sess_dev->dev_list);
|
||||
if (sess_dev->open_flags & FMODE_WRITE)
|
||||
@@ -239,14 +238,14 @@ void rnbd_destroy_sess_dev(struct rnbd_srv_sess_dev *sess_dev, bool keep_id)
|
||||
|
||||
static void destroy_sess(struct rnbd_srv_session *srv_sess)
|
||||
{
|
||||
struct rnbd_srv_sess_dev *sess_dev, *tmp;
|
||||
struct rnbd_srv_sess_dev *sess_dev;
|
||||
unsigned long index;
|
||||
|
||||
if (list_empty(&srv_sess->sess_dev_list))
|
||||
if (xa_empty(&srv_sess->index_idr))
|
||||
goto out;
|
||||
|
||||
mutex_lock(&srv_sess->lock);
|
||||
list_for_each_entry_safe(sess_dev, tmp, &srv_sess->sess_dev_list,
|
||||
sess_list)
|
||||
xa_for_each(&srv_sess->index_idr, index, sess_dev)
|
||||
rnbd_srv_destroy_dev_session_sysfs(sess_dev);
|
||||
mutex_unlock(&srv_sess->lock);
|
||||
|
||||
@@ -281,7 +280,6 @@ static int create_sess(struct rtrs_srv_sess *rtrs)
|
||||
|
||||
srv_sess->queue_depth = rtrs_srv_get_queue_depth(rtrs);
|
||||
xa_init_flags(&srv_sess->index_idr, XA_FLAGS_ALLOC);
|
||||
INIT_LIST_HEAD(&srv_sess->sess_dev_list);
|
||||
mutex_init(&srv_sess->lock);
|
||||
mutex_lock(&sess_lock);
|
||||
list_add(&srv_sess->list, &sess_list);
|
||||
@@ -323,10 +321,11 @@ void rnbd_srv_sess_dev_force_close(struct rnbd_srv_sess_dev *sess_dev,
|
||||
{
|
||||
struct rnbd_srv_session *sess = sess_dev->sess;
|
||||
|
||||
sess_dev->keep_id = true;
|
||||
/* It is already started to close by client's close message. */
|
||||
if (!mutex_trylock(&sess->lock))
|
||||
return;
|
||||
|
||||
sess_dev->keep_id = true;
|
||||
/* first remove sysfs itself to avoid deadlock */
|
||||
sysfs_remove_file_self(&sess_dev->kobj, &attr->attr);
|
||||
rnbd_srv_destroy_dev_session_sysfs(sess_dev);
|
||||
@@ -666,11 +665,12 @@ static struct rnbd_srv_sess_dev *
|
||||
find_srv_sess_dev(struct rnbd_srv_session *srv_sess, const char *dev_name)
|
||||
{
|
||||
struct rnbd_srv_sess_dev *sess_dev;
|
||||
unsigned long index;
|
||||
|
||||
if (list_empty(&srv_sess->sess_dev_list))
|
||||
if (xa_empty(&srv_sess->index_idr))
|
||||
return NULL;
|
||||
|
||||
list_for_each_entry(sess_dev, &srv_sess->sess_dev_list, sess_list)
|
||||
xa_for_each(&srv_sess->index_idr, index, sess_dev)
|
||||
if (!strcmp(sess_dev->pathname, dev_name))
|
||||
return sess_dev;
|
||||
|
||||
@@ -780,8 +780,6 @@ static int process_msg_open(struct rnbd_srv_session *srv_sess,
|
||||
list_add(&srv_sess_dev->dev_list, &srv_dev->sess_dev_list);
|
||||
mutex_unlock(&srv_dev->lock);
|
||||
|
||||
list_add(&srv_sess_dev->sess_list, &srv_sess->sess_dev_list);
|
||||
|
||||
rnbd_srv_info(srv_sess_dev, "Opened device '%s'\n", srv_dev->id);
|
||||
|
||||
kfree(full_path);
|
||||
|
||||
@@ -25,8 +25,6 @@ struct rnbd_srv_session {
|
||||
int queue_depth;
|
||||
|
||||
struct xarray index_idr;
|
||||
/* List of struct rnbd_srv_sess_dev */
|
||||
struct list_head sess_dev_list;
|
||||
struct mutex lock;
|
||||
u8 ver;
|
||||
};
|
||||
@@ -48,8 +46,6 @@ struct rnbd_srv_dev {
|
||||
struct rnbd_srv_sess_dev {
|
||||
/* Entry inside rnbd_srv_dev struct */
|
||||
struct list_head dev_list;
|
||||
/* Entry inside rnbd_srv_session struct */
|
||||
struct list_head sess_list;
|
||||
struct rnbd_dev *rnbd_dev;
|
||||
struct rnbd_srv_session *sess;
|
||||
struct rnbd_srv_dev *dev;
|
||||
|
||||
1582
drivers/block/sx8.c
1582
drivers/block/sx8.c
File diff suppressed because it is too large
Load Diff
@@ -47,7 +47,12 @@
|
||||
#define UBLK_MINORS (1U << MINORBITS)
|
||||
|
||||
/* All UBLK_F_* have to be included into UBLK_F_ALL */
|
||||
#define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY | UBLK_F_URING_CMD_COMP_IN_TASK)
|
||||
#define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY \
|
||||
| UBLK_F_URING_CMD_COMP_IN_TASK \
|
||||
| UBLK_F_NEED_GET_DATA)
|
||||
|
||||
/* All UBLK_PARAM_TYPE_* should be included here */
|
||||
#define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD)
|
||||
|
||||
struct ublk_rq_data {
|
||||
struct callback_head work;
|
||||
@@ -86,6 +91,15 @@ struct ublk_uring_cmd_pdu {
|
||||
*/
|
||||
#define UBLK_IO_FLAG_ABORTED 0x04
|
||||
|
||||
/*
|
||||
* UBLK_IO_FLAG_NEED_GET_DATA is set because IO command requires
|
||||
* getting the data buffer address from ublksrv.
|
||||
*
|
||||
* Then, bio data could be copied into this data buffer for a WRITE request
|
||||
* after the IO command is issued again and UBLK_IO_FLAG_NEED_GET_DATA is unset.
|
||||
*/
|
||||
#define UBLK_IO_FLAG_NEED_GET_DATA 0x08
|
||||
|
||||
struct ublk_io {
|
||||
/* userspace buffer address from io cmd */
|
||||
__u64 addr;
|
||||
@@ -119,7 +133,6 @@ struct ublk_device {
|
||||
char *__queues;
|
||||
|
||||
unsigned short queue_size;
|
||||
unsigned short bs_shift;
|
||||
struct ublksrv_ctrl_dev_info dev_info;
|
||||
|
||||
struct blk_mq_tag_set tag_set;
|
||||
@@ -137,6 +150,8 @@ struct ublk_device {
|
||||
spinlock_t mm_lock;
|
||||
struct mm_struct *mm;
|
||||
|
||||
struct ublk_params params;
|
||||
|
||||
struct completion completion;
|
||||
unsigned int nr_queues_ready;
|
||||
atomic_t nr_aborted_queues;
|
||||
@@ -149,6 +164,12 @@ struct ublk_device {
|
||||
struct work_struct stop_work;
|
||||
};
|
||||
|
||||
/* header of ublk_params */
|
||||
struct ublk_params_header {
|
||||
__u32 len;
|
||||
__u32 types;
|
||||
};
|
||||
|
||||
static dev_t ublk_chr_devt;
|
||||
static struct class *ublk_chr_class;
|
||||
|
||||
@@ -160,6 +181,90 @@ static DEFINE_MUTEX(ublk_ctl_mutex);
|
||||
|
||||
static struct miscdevice ublk_misc;
|
||||
|
||||
static void ublk_dev_param_basic_apply(struct ublk_device *ub)
|
||||
{
|
||||
struct request_queue *q = ub->ub_disk->queue;
|
||||
const struct ublk_param_basic *p = &ub->params.basic;
|
||||
|
||||
blk_queue_logical_block_size(q, 1 << p->logical_bs_shift);
|
||||
blk_queue_physical_block_size(q, 1 << p->physical_bs_shift);
|
||||
blk_queue_io_min(q, 1 << p->io_min_shift);
|
||||
blk_queue_io_opt(q, 1 << p->io_opt_shift);
|
||||
|
||||
blk_queue_write_cache(q, p->attrs & UBLK_ATTR_VOLATILE_CACHE,
|
||||
p->attrs & UBLK_ATTR_FUA);
|
||||
if (p->attrs & UBLK_ATTR_ROTATIONAL)
|
||||
blk_queue_flag_clear(QUEUE_FLAG_NONROT, q);
|
||||
else
|
||||
blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
|
||||
|
||||
blk_queue_max_hw_sectors(q, p->max_sectors);
|
||||
blk_queue_chunk_sectors(q, p->chunk_sectors);
|
||||
blk_queue_virt_boundary(q, p->virt_boundary_mask);
|
||||
|
||||
if (p->attrs & UBLK_ATTR_READ_ONLY)
|
||||
set_disk_ro(ub->ub_disk, true);
|
||||
|
||||
set_capacity(ub->ub_disk, p->dev_sectors);
|
||||
}
|
||||
|
||||
static void ublk_dev_param_discard_apply(struct ublk_device *ub)
|
||||
{
|
||||
struct request_queue *q = ub->ub_disk->queue;
|
||||
const struct ublk_param_discard *p = &ub->params.discard;
|
||||
|
||||
q->limits.discard_alignment = p->discard_alignment;
|
||||
q->limits.discard_granularity = p->discard_granularity;
|
||||
blk_queue_max_discard_sectors(q, p->max_discard_sectors);
|
||||
blk_queue_max_write_zeroes_sectors(q,
|
||||
p->max_write_zeroes_sectors);
|
||||
blk_queue_max_discard_segments(q, p->max_discard_segments);
|
||||
}
|
||||
|
||||
static int ublk_validate_params(const struct ublk_device *ub)
|
||||
{
|
||||
/* basic param is the only one which must be set */
|
||||
if (ub->params.types & UBLK_PARAM_TYPE_BASIC) {
|
||||
const struct ublk_param_basic *p = &ub->params.basic;
|
||||
|
||||
if (p->logical_bs_shift > PAGE_SHIFT)
|
||||
return -EINVAL;
|
||||
|
||||
if (p->logical_bs_shift > p->physical_bs_shift)
|
||||
return -EINVAL;
|
||||
|
||||
if (p->max_sectors > (ub->dev_info.max_io_buf_bytes >> 9))
|
||||
return -EINVAL;
|
||||
} else
|
||||
return -EINVAL;
|
||||
|
||||
if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) {
|
||||
const struct ublk_param_discard *p = &ub->params.discard;
|
||||
|
||||
/* So far, only support single segment discard */
|
||||
if (p->max_discard_sectors && p->max_discard_segments != 1)
|
||||
return -EINVAL;
|
||||
|
||||
if (!p->discard_granularity)
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int ublk_apply_params(struct ublk_device *ub)
|
||||
{
|
||||
if (!(ub->params.types & UBLK_PARAM_TYPE_BASIC))
|
||||
return -EINVAL;
|
||||
|
||||
ublk_dev_param_basic_apply(ub);
|
||||
|
||||
if (ub->params.types & UBLK_PARAM_TYPE_DISCARD)
|
||||
ublk_dev_param_discard_apply(ub);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline bool ublk_can_use_task_work(const struct ublk_queue *ubq)
|
||||
{
|
||||
if (IS_BUILTIN(CONFIG_BLK_DEV_UBLK) &&
|
||||
@@ -168,6 +273,13 @@ static inline bool ublk_can_use_task_work(const struct ublk_queue *ubq)
|
||||
return false;
|
||||
}
|
||||
|
||||
static inline bool ublk_need_get_data(const struct ublk_queue *ubq)
|
||||
{
|
||||
if (ubq->flags & UBLK_F_NEED_GET_DATA)
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
static struct ublk_device *ublk_get_device(struct ublk_device *ub)
|
||||
{
|
||||
if (kobject_get_unless_zero(&ub->cdev_dev.kobj))
|
||||
@@ -509,6 +621,21 @@ static void __ublk_fail_req(struct ublk_io *io, struct request *req)
|
||||
}
|
||||
}
|
||||
|
||||
static void ubq_complete_io_cmd(struct ublk_io *io, int res)
|
||||
{
|
||||
/* mark this cmd owned by ublksrv */
|
||||
io->flags |= UBLK_IO_FLAG_OWNED_BY_SRV;
|
||||
|
||||
/*
|
||||
* clear ACTIVE since we are done with this sqe/cmd slot
|
||||
* We can only accept io cmd in case of being not active.
|
||||
*/
|
||||
io->flags &= ~UBLK_IO_FLAG_ACTIVE;
|
||||
|
||||
/* tell ublksrv one io request is coming */
|
||||
io_uring_cmd_done(io->cmd, res, 0);
|
||||
}
|
||||
|
||||
#define UBLK_REQUEUE_DELAY_MS 3
|
||||
|
||||
static inline void __ublk_rq_task_work(struct request *req)
|
||||
@@ -531,6 +658,30 @@ static inline void __ublk_rq_task_work(struct request *req)
|
||||
return;
|
||||
}
|
||||
|
||||
if (ublk_need_get_data(ubq) &&
|
||||
(req_op(req) == REQ_OP_WRITE ||
|
||||
req_op(req) == REQ_OP_FLUSH)) {
|
||||
/*
|
||||
* We have not handled UBLK_IO_NEED_GET_DATA command yet,
|
||||
* so immediately pass UBLK_IO_RES_NEED_GET_DATA to ublksrv
|
||||
* and notify it.
|
||||
*/
|
||||
if (!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA)) {
|
||||
io->flags |= UBLK_IO_FLAG_NEED_GET_DATA;
|
||||
pr_devel("%s: need get data. op %d, qid %d tag %d io_flags %x\n",
|
||||
__func__, io->cmd->cmd_op, ubq->q_id,
|
||||
req->tag, io->flags);
|
||||
ubq_complete_io_cmd(io, UBLK_IO_RES_NEED_GET_DATA);
|
||||
return;
|
||||
}
|
||||
/*
|
||||
* We have handled UBLK_IO_NEED_GET_DATA command,
|
||||
* so clear UBLK_IO_FLAG_NEED_GET_DATA now and just
|
||||
* do the copy work.
|
||||
*/
|
||||
io->flags &= ~UBLK_IO_FLAG_NEED_GET_DATA;
|
||||
}
|
||||
|
||||
mapped_bytes = ublk_map_io(ubq, req, io);
|
||||
|
||||
/* partially mapped, update io descriptor */
|
||||
@@ -553,17 +704,7 @@ static inline void __ublk_rq_task_work(struct request *req)
|
||||
mapped_bytes >> 9;
|
||||
}
|
||||
|
||||
/* mark this cmd owned by ublksrv */
|
||||
io->flags |= UBLK_IO_FLAG_OWNED_BY_SRV;
|
||||
|
||||
/*
|
||||
* clear ACTIVE since we are done with this sqe/cmd slot
|
||||
* We can only accept io cmd in case of being not active.
|
||||
*/
|
||||
io->flags &= ~UBLK_IO_FLAG_ACTIVE;
|
||||
|
||||
/* tell ublksrv one io request is coming */
|
||||
io_uring_cmd_done(io->cmd, UBLK_IO_RES_OK, 0);
|
||||
ubq_complete_io_cmd(io, UBLK_IO_RES_OK);
|
||||
}
|
||||
|
||||
static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd)
|
||||
@@ -788,16 +929,27 @@ static void ublk_daemon_monitor_work(struct work_struct *work)
|
||||
UBLK_DAEMON_MONITOR_PERIOD);
|
||||
}
|
||||
|
||||
static inline bool ublk_queue_ready(struct ublk_queue *ubq)
|
||||
{
|
||||
return ubq->nr_io_ready == ubq->q_depth;
|
||||
}
|
||||
|
||||
static void ublk_cancel_queue(struct ublk_queue *ubq)
|
||||
{
|
||||
int i;
|
||||
|
||||
if (!ublk_queue_ready(ubq))
|
||||
return;
|
||||
|
||||
for (i = 0; i < ubq->q_depth; i++) {
|
||||
struct ublk_io *io = &ubq->ios[i];
|
||||
|
||||
if (io->flags & UBLK_IO_FLAG_ACTIVE)
|
||||
io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, 0);
|
||||
}
|
||||
|
||||
/* all io commands are canceled */
|
||||
ubq->nr_io_ready = 0;
|
||||
}
|
||||
|
||||
/* Cancel all pending commands, must be called after del_gendisk() returns */
|
||||
@@ -818,19 +970,14 @@ static void ublk_stop_dev(struct ublk_device *ub)
|
||||
del_gendisk(ub->ub_disk);
|
||||
ub->dev_info.state = UBLK_S_DEV_DEAD;
|
||||
ub->dev_info.ublksrv_pid = -1;
|
||||
ublk_cancel_dev(ub);
|
||||
put_disk(ub->ub_disk);
|
||||
ub->ub_disk = NULL;
|
||||
unlock:
|
||||
ublk_cancel_dev(ub);
|
||||
mutex_unlock(&ub->mutex);
|
||||
cancel_delayed_work_sync(&ub->monitor_work);
|
||||
}
|
||||
|
||||
static inline bool ublk_queue_ready(struct ublk_queue *ubq)
|
||||
{
|
||||
return ubq->nr_io_ready == ubq->q_depth;
|
||||
}
|
||||
|
||||
/* device can only be started after all IOs are ready */
|
||||
static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq)
|
||||
{
|
||||
@@ -846,6 +993,25 @@ static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq)
|
||||
mutex_unlock(&ub->mutex);
|
||||
}
|
||||
|
||||
static void ublk_handle_need_get_data(struct ublk_device *ub, int q_id,
|
||||
int tag, struct io_uring_cmd *cmd)
|
||||
{
|
||||
struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
|
||||
struct request *req = blk_mq_tag_to_rq(ub->tag_set.tags[q_id], tag);
|
||||
|
||||
if (ublk_can_use_task_work(ubq)) {
|
||||
struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
|
||||
|
||||
/* should not fail since we call it just in ubq->ubq_daemon */
|
||||
task_work_add(ubq->ubq_daemon, &data->work, TWA_SIGNAL_NO_IPI);
|
||||
} else {
|
||||
struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
|
||||
|
||||
pdu->req = req;
|
||||
io_uring_cmd_complete_in_task(cmd, ublk_rq_task_work_cb);
|
||||
}
|
||||
}
|
||||
|
||||
static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
|
||||
{
|
||||
struct ublksrv_io_cmd *ub_cmd = (struct ublksrv_io_cmd *)cmd->cmd;
|
||||
@@ -884,6 +1050,14 @@ static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
|
||||
goto out;
|
||||
}
|
||||
|
||||
/*
|
||||
* ensure that the user issues UBLK_IO_NEED_GET_DATA
|
||||
* iff the driver has set the UBLK_IO_FLAG_NEED_GET_DATA.
|
||||
*/
|
||||
if ((!!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA))
|
||||
^ (cmd_op == UBLK_IO_NEED_GET_DATA))
|
||||
goto out;
|
||||
|
||||
switch (cmd_op) {
|
||||
case UBLK_IO_FETCH_REQ:
|
||||
/* UBLK_IO_FETCH_REQ is only allowed before queue is setup */
|
||||
@@ -917,6 +1091,14 @@ static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
|
||||
io->cmd = cmd;
|
||||
ublk_commit_completion(ub, ub_cmd);
|
||||
break;
|
||||
case UBLK_IO_NEED_GET_DATA:
|
||||
if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
|
||||
goto out;
|
||||
io->addr = ub_cmd->addr;
|
||||
io->cmd = cmd;
|
||||
io->flags |= UBLK_IO_FLAG_ACTIVE;
|
||||
ublk_handle_need_get_data(ub, ub_cmd->q_id, ub_cmd->tag, cmd);
|
||||
break;
|
||||
default:
|
||||
goto out;
|
||||
}
|
||||
@@ -1083,13 +1265,13 @@ static void ublk_stop_work_fn(struct work_struct *work)
|
||||
ublk_stop_dev(ub);
|
||||
}
|
||||
|
||||
/* align maximum I/O size to PAGE_SIZE */
|
||||
/* align max io buffer size with PAGE_SIZE */
|
||||
static void ublk_align_max_io_size(struct ublk_device *ub)
|
||||
{
|
||||
unsigned int max_rq_bytes = ub->dev_info.rq_max_blocks << ub->bs_shift;
|
||||
unsigned int max_io_bytes = ub->dev_info.max_io_buf_bytes;
|
||||
|
||||
ub->dev_info.rq_max_blocks =
|
||||
round_down(max_rq_bytes, PAGE_SIZE) >> ub->bs_shift;
|
||||
ub->dev_info.max_io_buf_bytes =
|
||||
round_down(max_io_bytes, PAGE_SIZE);
|
||||
}
|
||||
|
||||
static int ublk_add_tag_set(struct ublk_device *ub)
|
||||
@@ -1132,7 +1314,6 @@ static int ublk_ctrl_start_dev(struct io_uring_cmd *cmd)
|
||||
{
|
||||
struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
|
||||
int ublksrv_pid = (int)header->data[0];
|
||||
unsigned long dev_blocks = header->data[1];
|
||||
struct ublk_device *ub;
|
||||
struct gendisk *disk;
|
||||
int ret = -EINVAL;
|
||||
@@ -1155,10 +1336,6 @@ static int ublk_ctrl_start_dev(struct io_uring_cmd *cmd)
|
||||
goto out_unlock;
|
||||
}
|
||||
|
||||
/* We may get disk size updated */
|
||||
if (dev_blocks)
|
||||
ub->dev_info.dev_blocks = dev_blocks;
|
||||
|
||||
disk = blk_mq_alloc_disk(&ub->tag_set, ub);
|
||||
if (IS_ERR(disk)) {
|
||||
ret = PTR_ERR(disk);
|
||||
@@ -1168,27 +1345,28 @@ static int ublk_ctrl_start_dev(struct io_uring_cmd *cmd)
|
||||
disk->fops = &ub_fops;
|
||||
disk->private_data = ub;
|
||||
|
||||
blk_queue_logical_block_size(disk->queue, ub->dev_info.block_size);
|
||||
blk_queue_physical_block_size(disk->queue, ub->dev_info.block_size);
|
||||
blk_queue_io_min(disk->queue, ub->dev_info.block_size);
|
||||
blk_queue_max_hw_sectors(disk->queue,
|
||||
ub->dev_info.rq_max_blocks << (ub->bs_shift - 9));
|
||||
disk->queue->limits.discard_granularity = PAGE_SIZE;
|
||||
blk_queue_max_discard_sectors(disk->queue, UINT_MAX >> 9);
|
||||
blk_queue_max_write_zeroes_sectors(disk->queue, UINT_MAX >> 9);
|
||||
|
||||
set_capacity(disk, ub->dev_info.dev_blocks << (ub->bs_shift - 9));
|
||||
|
||||
ub->dev_info.ublksrv_pid = ublksrv_pid;
|
||||
ub->ub_disk = disk;
|
||||
|
||||
ret = ublk_apply_params(ub);
|
||||
if (ret)
|
||||
goto out_put_disk;
|
||||
|
||||
get_device(&ub->cdev_dev);
|
||||
ret = add_disk(disk);
|
||||
if (ret) {
|
||||
put_disk(disk);
|
||||
goto out_unlock;
|
||||
/*
|
||||
* Has to drop the reference since ->free_disk won't be
|
||||
* called in case of add_disk failure.
|
||||
*/
|
||||
ublk_put_device(ub);
|
||||
goto out_put_disk;
|
||||
}
|
||||
set_bit(UB_STATE_USED, &ub->state);
|
||||
ub->dev_info.state = UBLK_S_DEV_LIVE;
|
||||
out_put_disk:
|
||||
if (ret)
|
||||
put_disk(disk);
|
||||
out_unlock:
|
||||
mutex_unlock(&ub->mutex);
|
||||
ublk_put_device(ub);
|
||||
@@ -1250,9 +1428,8 @@ static inline void ublk_dump_dev_info(struct ublksrv_ctrl_dev_info *info)
|
||||
{
|
||||
pr_devel("%s: dev id %d flags %llx\n", __func__,
|
||||
info->dev_id, info->flags);
|
||||
pr_devel("\t nr_hw_queues %d queue_depth %d block size %d dev_capacity %lld\n",
|
||||
info->nr_hw_queues, info->queue_depth,
|
||||
info->block_size, info->dev_blocks);
|
||||
pr_devel("\t nr_hw_queues %d queue_depth %d\n",
|
||||
info->nr_hw_queues, info->queue_depth);
|
||||
}
|
||||
|
||||
static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
|
||||
@@ -1312,7 +1489,6 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
|
||||
/* We are not ready to support zero copy */
|
||||
ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY;
|
||||
|
||||
ub->bs_shift = ilog2(ub->dev_info.block_size);
|
||||
ub->dev_info.nr_hw_queues = min_t(unsigned int,
|
||||
ub->dev_info.nr_hw_queues, nr_cpu_ids);
|
||||
ublk_align_max_io_size(ub);
|
||||
@@ -1436,6 +1612,82 @@ static int ublk_ctrl_get_dev_info(struct io_uring_cmd *cmd)
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int ublk_ctrl_get_params(struct io_uring_cmd *cmd)
|
||||
{
|
||||
struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
|
||||
void __user *argp = (void __user *)(unsigned long)header->addr;
|
||||
struct ublk_params_header ph;
|
||||
struct ublk_device *ub;
|
||||
int ret;
|
||||
|
||||
if (header->len <= sizeof(ph) || !header->addr)
|
||||
return -EINVAL;
|
||||
|
||||
if (copy_from_user(&ph, argp, sizeof(ph)))
|
||||
return -EFAULT;
|
||||
|
||||
if (ph.len > header->len || !ph.len)
|
||||
return -EINVAL;
|
||||
|
||||
if (ph.len > sizeof(struct ublk_params))
|
||||
ph.len = sizeof(struct ublk_params);
|
||||
|
||||
ub = ublk_get_device_from_id(header->dev_id);
|
||||
if (!ub)
|
||||
return -EINVAL;
|
||||
|
||||
mutex_lock(&ub->mutex);
|
||||
if (copy_to_user(argp, &ub->params, ph.len))
|
||||
ret = -EFAULT;
|
||||
else
|
||||
ret = 0;
|
||||
mutex_unlock(&ub->mutex);
|
||||
|
||||
ublk_put_device(ub);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int ublk_ctrl_set_params(struct io_uring_cmd *cmd)
|
||||
{
|
||||
struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
|
||||
void __user *argp = (void __user *)(unsigned long)header->addr;
|
||||
struct ublk_params_header ph;
|
||||
struct ublk_device *ub;
|
||||
int ret = -EFAULT;
|
||||
|
||||
if (header->len <= sizeof(ph) || !header->addr)
|
||||
return -EINVAL;
|
||||
|
||||
if (copy_from_user(&ph, argp, sizeof(ph)))
|
||||
return -EFAULT;
|
||||
|
||||
if (ph.len > header->len || !ph.len || !ph.types)
|
||||
return -EINVAL;
|
||||
|
||||
if (ph.len > sizeof(struct ublk_params))
|
||||
ph.len = sizeof(struct ublk_params);
|
||||
|
||||
ub = ublk_get_device_from_id(header->dev_id);
|
||||
if (!ub)
|
||||
return -EINVAL;
|
||||
|
||||
/* parameters can only be changed when device isn't live */
|
||||
mutex_lock(&ub->mutex);
|
||||
if (ub->dev_info.state == UBLK_S_DEV_LIVE) {
|
||||
ret = -EACCES;
|
||||
} else if (copy_from_user(&ub->params, argp, ph.len)) {
|
||||
ret = -EFAULT;
|
||||
} else {
|
||||
/* clear all we don't support yet */
|
||||
ub->params.types &= UBLK_PARAM_TYPE_ALL;
|
||||
ret = ublk_validate_params(ub);
|
||||
}
|
||||
mutex_unlock(&ub->mutex);
|
||||
ublk_put_device(ub);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
|
||||
unsigned int issue_flags)
|
||||
{
|
||||
@@ -1471,6 +1723,12 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
|
||||
case UBLK_CMD_GET_QUEUE_AFFINITY:
|
||||
ret = ublk_ctrl_get_queue_affinity(cmd);
|
||||
break;
|
||||
case UBLK_CMD_GET_PARAMS:
|
||||
ret = ublk_ctrl_get_params(cmd);
|
||||
break;
|
||||
case UBLK_CMD_SET_PARAMS:
|
||||
ret = ublk_ctrl_set_params(cmd);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -78,20 +78,21 @@ config INFINIBAND_VIRT_DMA
|
||||
def_bool !HIGHMEM
|
||||
|
||||
if INFINIBAND_USER_ACCESS || !INFINIBAND_USER_ACCESS
|
||||
source "drivers/infiniband/hw/mthca/Kconfig"
|
||||
source "drivers/infiniband/hw/qib/Kconfig"
|
||||
source "drivers/infiniband/hw/bnxt_re/Kconfig"
|
||||
source "drivers/infiniband/hw/cxgb4/Kconfig"
|
||||
source "drivers/infiniband/hw/efa/Kconfig"
|
||||
source "drivers/infiniband/hw/erdma/Kconfig"
|
||||
source "drivers/infiniband/hw/hfi1/Kconfig"
|
||||
source "drivers/infiniband/hw/hns/Kconfig"
|
||||
source "drivers/infiniband/hw/irdma/Kconfig"
|
||||
source "drivers/infiniband/hw/mlx4/Kconfig"
|
||||
source "drivers/infiniband/hw/mlx5/Kconfig"
|
||||
source "drivers/infiniband/hw/mthca/Kconfig"
|
||||
source "drivers/infiniband/hw/ocrdma/Kconfig"
|
||||
source "drivers/infiniband/hw/vmw_pvrdma/Kconfig"
|
||||
source "drivers/infiniband/hw/usnic/Kconfig"
|
||||
source "drivers/infiniband/hw/hns/Kconfig"
|
||||
source "drivers/infiniband/hw/bnxt_re/Kconfig"
|
||||
source "drivers/infiniband/hw/hfi1/Kconfig"
|
||||
source "drivers/infiniband/hw/qedr/Kconfig"
|
||||
source "drivers/infiniband/hw/qib/Kconfig"
|
||||
source "drivers/infiniband/hw/usnic/Kconfig"
|
||||
source "drivers/infiniband/hw/vmw_pvrdma/Kconfig"
|
||||
source "drivers/infiniband/sw/rdmavt/Kconfig"
|
||||
source "drivers/infiniband/sw/rxe/Kconfig"
|
||||
source "drivers/infiniband/sw/siw/Kconfig"
|
||||
|
||||
@@ -11,6 +11,7 @@
|
||||
#include <linux/in6.h>
|
||||
#include <linux/mutex.h>
|
||||
#include <linux/random.h>
|
||||
#include <linux/rbtree.h>
|
||||
#include <linux/igmp.h>
|
||||
#include <linux/xarray.h>
|
||||
#include <linux/inetdevice.h>
|
||||
@@ -20,6 +21,7 @@
|
||||
|
||||
#include <net/net_namespace.h>
|
||||
#include <net/netns/generic.h>
|
||||
#include <net/netevent.h>
|
||||
#include <net/tcp.h>
|
||||
#include <net/ipv6.h>
|
||||
#include <net/ip_fib.h>
|
||||
@@ -168,6 +170,9 @@ static struct ib_sa_client sa_client;
|
||||
static LIST_HEAD(dev_list);
|
||||
static LIST_HEAD(listen_any_list);
|
||||
static DEFINE_MUTEX(lock);
|
||||
static struct rb_root id_table = RB_ROOT;
|
||||
/* Serialize operations of id_table tree */
|
||||
static DEFINE_SPINLOCK(id_table_lock);
|
||||
static struct workqueue_struct *cma_wq;
|
||||
static unsigned int cma_pernet_id;
|
||||
|
||||
@@ -202,6 +207,11 @@ struct xarray *cma_pernet_xa(struct net *net, enum rdma_ucm_port_space ps)
|
||||
}
|
||||
}
|
||||
|
||||
struct id_table_entry {
|
||||
struct list_head id_list;
|
||||
struct rb_node rb_node;
|
||||
};
|
||||
|
||||
struct cma_device {
|
||||
struct list_head list;
|
||||
struct ib_device *device;
|
||||
@@ -420,11 +430,21 @@ static inline u8 cma_get_ip_ver(const struct cma_hdr *hdr)
|
||||
return hdr->ip_version >> 4;
|
||||
}
|
||||
|
||||
static inline void cma_set_ip_ver(struct cma_hdr *hdr, u8 ip_ver)
|
||||
/* Store @ip_ver in the upper four bits of hdr->ip_version, keeping the lower four. */
static void cma_set_ip_ver(struct cma_hdr *hdr, u8 ip_ver)
{
	u8 low_nibble = hdr->ip_version & 0xF;

	hdr->ip_version = (ip_ver << 4) | low_nibble;
}
|
||||
|
||||
static struct sockaddr *cma_src_addr(struct rdma_id_private *id_priv)
|
||||
{
|
||||
return (struct sockaddr *)&id_priv->id.route.addr.src_addr;
|
||||
}
|
||||
|
||||
static inline struct sockaddr *cma_dst_addr(struct rdma_id_private *id_priv)
|
||||
{
|
||||
return (struct sockaddr *)&id_priv->id.route.addr.dst_addr;
|
||||
}
|
||||
|
||||
static int cma_igmp_send(struct net_device *ndev, union ib_gid *mgid, bool join)
|
||||
{
|
||||
struct in_device *in_dev = NULL;
|
||||
@@ -445,6 +465,117 @@ static int cma_igmp_send(struct net_device *ndev, union ib_gid *mgid, bool join)
|
||||
return (in_dev) ? 0 : -ENODEV;
|
||||
}
|
||||
|
||||
static int compare_netdev_and_ip(int ifindex_a, struct sockaddr *sa,
|
||||
struct id_table_entry *entry_b)
|
||||
{
|
||||
struct rdma_id_private *id_priv = list_first_entry(
|
||||
&entry_b->id_list, struct rdma_id_private, id_list_entry);
|
||||
int ifindex_b = id_priv->id.route.addr.dev_addr.bound_dev_if;
|
||||
struct sockaddr *sb = cma_dst_addr(id_priv);
|
||||
|
||||
if (ifindex_a != ifindex_b)
|
||||
return (ifindex_a > ifindex_b) ? 1 : -1;
|
||||
|
||||
if (sa->sa_family != sb->sa_family)
|
||||
return sa->sa_family - sb->sa_family;
|
||||
|
||||
if (sa->sa_family == AF_INET)
|
||||
return memcmp((char *)&((struct sockaddr_in *)sa)->sin_addr,
|
||||
(char *)&((struct sockaddr_in *)sb)->sin_addr,
|
||||
sizeof(((struct sockaddr_in *)sa)->sin_addr));
|
||||
|
||||
return ipv6_addr_cmp(&((struct sockaddr_in6 *)sa)->sin6_addr,
|
||||
&((struct sockaddr_in6 *)sb)->sin6_addr);
|
||||
}
|
||||
|
||||
static int cma_add_id_to_tree(struct rdma_id_private *node_id_priv)
|
||||
{
|
||||
struct rb_node **new, *parent = NULL;
|
||||
struct id_table_entry *this, *node;
|
||||
unsigned long flags;
|
||||
int result;
|
||||
|
||||
node = kzalloc(sizeof(*node), GFP_KERNEL);
|
||||
if (!node)
|
||||
return -ENOMEM;
|
||||
|
||||
spin_lock_irqsave(&id_table_lock, flags);
|
||||
new = &id_table.rb_node;
|
||||
while (*new) {
|
||||
this = container_of(*new, struct id_table_entry, rb_node);
|
||||
result = compare_netdev_and_ip(
|
||||
node_id_priv->id.route.addr.dev_addr.bound_dev_if,
|
||||
cma_dst_addr(node_id_priv), this);
|
||||
|
||||
parent = *new;
|
||||
if (result < 0)
|
||||
new = &((*new)->rb_left);
|
||||
else if (result > 0)
|
||||
new = &((*new)->rb_right);
|
||||
else {
|
||||
list_add_tail(&node_id_priv->id_list_entry,
|
||||
&this->id_list);
|
||||
kfree(node);
|
||||
goto unlock;
|
||||
}
|
||||
}
|
||||
|
||||
INIT_LIST_HEAD(&node->id_list);
|
||||
list_add_tail(&node_id_priv->id_list_entry, &node->id_list);
|
||||
|
||||
rb_link_node(&node->rb_node, parent, new);
|
||||
rb_insert_color(&node->rb_node, &id_table);
|
||||
|
||||
unlock:
|
||||
spin_unlock_irqrestore(&id_table_lock, flags);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct id_table_entry *
|
||||
node_from_ndev_ip(struct rb_root *root, int ifindex, struct sockaddr *sa)
|
||||
{
|
||||
struct rb_node *node = root->rb_node;
|
||||
struct id_table_entry *data;
|
||||
int result;
|
||||
|
||||
while (node) {
|
||||
data = container_of(node, struct id_table_entry, rb_node);
|
||||
result = compare_netdev_and_ip(ifindex, sa, data);
|
||||
if (result < 0)
|
||||
node = node->rb_left;
|
||||
else if (result > 0)
|
||||
node = node->rb_right;
|
||||
else
|
||||
return data;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static void cma_remove_id_from_tree(struct rdma_id_private *id_priv)
|
||||
{
|
||||
struct id_table_entry *data;
|
||||
unsigned long flags;
|
||||
|
||||
spin_lock_irqsave(&id_table_lock, flags);
|
||||
if (list_empty(&id_priv->id_list_entry))
|
||||
goto out;
|
||||
|
||||
data = node_from_ndev_ip(&id_table,
|
||||
id_priv->id.route.addr.dev_addr.bound_dev_if,
|
||||
cma_dst_addr(id_priv));
|
||||
if (!data)
|
||||
goto out;
|
||||
|
||||
list_del_init(&id_priv->id_list_entry);
|
||||
if (list_empty(&data->id_list)) {
|
||||
rb_erase(&data->rb_node, &id_table);
|
||||
kfree(data);
|
||||
}
|
||||
out:
|
||||
spin_unlock_irqrestore(&id_table_lock, flags);
|
||||
}
|
||||
|
||||
static void _cma_attach_to_dev(struct rdma_id_private *id_priv,
|
||||
struct cma_device *cma_dev)
|
||||
{
|
||||
@@ -481,16 +612,6 @@ static void cma_release_dev(struct rdma_id_private *id_priv)
|
||||
mutex_unlock(&lock);
|
||||
}
|
||||
|
||||
static inline struct sockaddr *cma_src_addr(struct rdma_id_private *id_priv)
|
||||
{
|
||||
return (struct sockaddr *) &id_priv->id.route.addr.src_addr;
|
||||
}
|
||||
|
||||
static inline struct sockaddr *cma_dst_addr(struct rdma_id_private *id_priv)
|
||||
{
|
||||
return (struct sockaddr *) &id_priv->id.route.addr.dst_addr;
|
||||
}
|
||||
|
||||
static inline unsigned short cma_family(struct rdma_id_private *id_priv)
|
||||
{
|
||||
return id_priv->id.route.addr.src_addr.ss_family;
|
||||
@@ -861,6 +982,7 @@ __rdma_create_id(struct net *net, rdma_cm_event_handler event_handler,
|
||||
refcount_set(&id_priv->refcount, 1);
|
||||
mutex_init(&id_priv->handler_mutex);
|
||||
INIT_LIST_HEAD(&id_priv->device_item);
|
||||
INIT_LIST_HEAD(&id_priv->id_list_entry);
|
||||
INIT_LIST_HEAD(&id_priv->listen_list);
|
||||
INIT_LIST_HEAD(&id_priv->mc_list);
|
||||
get_random_bytes(&id_priv->seq_num, sizeof id_priv->seq_num);
|
||||
@@ -1883,6 +2005,7 @@ static void _destroy_id(struct rdma_id_private *id_priv,
|
||||
cma_cancel_operation(id_priv, state);
|
||||
|
||||
rdma_restrack_del(&id_priv->res);
|
||||
cma_remove_id_from_tree(id_priv);
|
||||
if (id_priv->cma_dev) {
|
||||
if (rdma_cap_ib_cm(id_priv->id.device, 1)) {
|
||||
if (id_priv->cm_id.ib)
|
||||
@@ -3172,8 +3295,11 @@ int rdma_resolve_route(struct rdma_cm_id *id, unsigned long timeout_ms)
|
||||
cma_id_get(id_priv);
|
||||
if (rdma_cap_ib_sa(id->device, id->port_num))
|
||||
ret = cma_resolve_ib_route(id_priv, timeout_ms);
|
||||
else if (rdma_protocol_roce(id->device, id->port_num))
|
||||
else if (rdma_protocol_roce(id->device, id->port_num)) {
|
||||
ret = cma_resolve_iboe_route(id_priv);
|
||||
if (!ret)
|
||||
cma_add_id_to_tree(id_priv);
|
||||
}
|
||||
else if (rdma_protocol_iwarp(id->device, id->port_num))
|
||||
ret = cma_resolve_iw_route(id_priv);
|
||||
else
|
||||
@@ -4922,10 +5048,87 @@ out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void cma_netevent_work_handler(struct work_struct *_work)
|
||||
{
|
||||
struct rdma_id_private *id_priv =
|
||||
container_of(_work, struct rdma_id_private, id.net_work);
|
||||
struct rdma_cm_event event = {};
|
||||
|
||||
mutex_lock(&id_priv->handler_mutex);
|
||||
|
||||
if (READ_ONCE(id_priv->state) == RDMA_CM_DESTROYING ||
|
||||
READ_ONCE(id_priv->state) == RDMA_CM_DEVICE_REMOVAL)
|
||||
goto out_unlock;
|
||||
|
||||
event.event = RDMA_CM_EVENT_UNREACHABLE;
|
||||
event.status = -ETIMEDOUT;
|
||||
|
||||
if (cma_cm_event_handler(id_priv, &event)) {
|
||||
__acquire(&id_priv->handler_mutex);
|
||||
id_priv->cm_id.ib = NULL;
|
||||
cma_id_put(id_priv);
|
||||
destroy_id_handler_unlock(id_priv);
|
||||
return;
|
||||
}
|
||||
|
||||
out_unlock:
|
||||
mutex_unlock(&id_priv->handler_mutex);
|
||||
cma_id_put(id_priv);
|
||||
}
|
||||
|
||||
static int cma_netevent_callback(struct notifier_block *self,
|
||||
unsigned long event, void *ctx)
|
||||
{
|
||||
struct id_table_entry *ips_node = NULL;
|
||||
struct rdma_id_private *current_id;
|
||||
struct neighbour *neigh = ctx;
|
||||
unsigned long flags;
|
||||
|
||||
if (event != NETEVENT_NEIGH_UPDATE)
|
||||
return NOTIFY_DONE;
|
||||
|
||||
spin_lock_irqsave(&id_table_lock, flags);
|
||||
if (neigh->tbl->family == AF_INET6) {
|
||||
struct sockaddr_in6 neigh_sock_6;
|
||||
|
||||
neigh_sock_6.sin6_family = AF_INET6;
|
||||
neigh_sock_6.sin6_addr = *(struct in6_addr *)neigh->primary_key;
|
||||
ips_node = node_from_ndev_ip(&id_table, neigh->dev->ifindex,
|
||||
(struct sockaddr *)&neigh_sock_6);
|
||||
} else if (neigh->tbl->family == AF_INET) {
|
||||
struct sockaddr_in neigh_sock_4;
|
||||
|
||||
neigh_sock_4.sin_family = AF_INET;
|
||||
neigh_sock_4.sin_addr.s_addr = *(__be32 *)(neigh->primary_key);
|
||||
ips_node = node_from_ndev_ip(&id_table, neigh->dev->ifindex,
|
||||
(struct sockaddr *)&neigh_sock_4);
|
||||
} else
|
||||
goto out;
|
||||
|
||||
if (!ips_node)
|
||||
goto out;
|
||||
|
||||
list_for_each_entry(current_id, &ips_node->id_list, id_list_entry) {
|
||||
if (!memcmp(current_id->id.route.addr.dev_addr.dst_dev_addr,
|
||||
neigh->ha, ETH_ALEN))
|
||||
continue;
|
||||
INIT_WORK(¤t_id->id.net_work, cma_netevent_work_handler);
|
||||
cma_id_get(current_id);
|
||||
queue_work(cma_wq, ¤t_id->id.net_work);
|
||||
}
|
||||
out:
|
||||
spin_unlock_irqrestore(&id_table_lock, flags);
|
||||
return NOTIFY_DONE;
|
||||
}
|
||||
|
||||
static struct notifier_block cma_nb = {
|
||||
.notifier_call = cma_netdev_callback
|
||||
};
|
||||
|
||||
static struct notifier_block cma_netevent_cb = {
|
||||
.notifier_call = cma_netevent_callback
|
||||
};
|
||||
|
||||
static void cma_send_device_removal_put(struct rdma_id_private *id_priv)
|
||||
{
|
||||
struct rdma_cm_event event = { .event = RDMA_CM_EVENT_DEVICE_REMOVAL };
|
||||
@@ -5148,6 +5351,7 @@ static int __init cma_init(void)
|
||||
|
||||
ib_sa_register_client(&sa_client);
|
||||
register_netdevice_notifier(&cma_nb);
|
||||
register_netevent_notifier(&cma_netevent_cb);
|
||||
|
||||
ret = ib_register_client(&cma_client);
|
||||
if (ret)
|
||||
@@ -5162,6 +5366,7 @@ static int __init cma_init(void)
|
||||
err_ib:
|
||||
ib_unregister_client(&cma_client);
|
||||
err:
|
||||
unregister_netevent_notifier(&cma_netevent_cb);
|
||||
unregister_netdevice_notifier(&cma_nb);
|
||||
ib_sa_unregister_client(&sa_client);
|
||||
unregister_pernet_subsys(&cma_pernet_operations);
|
||||
@@ -5174,6 +5379,7 @@ static void __exit cma_cleanup(void)
|
||||
{
|
||||
cma_configfs_exit();
|
||||
ib_unregister_client(&cma_client);
|
||||
unregister_netevent_notifier(&cma_netevent_cb);
|
||||
unregister_netdevice_notifier(&cma_nb);
|
||||
ib_sa_unregister_client(&sa_client);
|
||||
unregister_pernet_subsys(&cma_pernet_operations);
|
||||
|
||||
@@ -64,6 +64,7 @@ struct rdma_id_private {
|
||||
struct list_head listen_item;
|
||||
struct list_head listen_list;
|
||||
};
|
||||
struct list_head id_list_entry;
|
||||
struct cma_device *cma_dev;
|
||||
struct list_head mc_list;
|
||||
|
||||
|
||||
@@ -68,7 +68,7 @@ static int uverbs_try_lock_object(struct ib_uobject *uobj,
|
||||
* In exclusive access mode, we check that the counter is zero (nobody
|
||||
* claimed this object) and we set it to -1. Releasing a shared access
|
||||
* lock is done simply by decreasing the counter. As for exclusive
|
||||
* access locks, since only a single one of them is is allowed
|
||||
* access locks, since only a single one of them is allowed
|
||||
* concurrently, setting the counter to zero is enough for releasing
|
||||
* this lock.
|
||||
*/
|
||||
|
||||
@@ -250,7 +250,7 @@ static bool upper_device_filter(struct ib_device *ib_dev, u32 port,
|
||||
|
||||
/**
|
||||
* is_upper_ndev_bond_master_filter - Check if a given netdevice
|
||||
* is bond master device of netdevice of the the RDMA device of port.
|
||||
* is bond master device of netdevice of the RDMA device of port.
|
||||
* @ib_dev: IB device to check
|
||||
* @port: Port to consider for adding default GID
|
||||
* @rdma_ndev: Pointer to rdma netdevice
|
||||
|
||||
@@ -13,3 +13,4 @@ obj-$(CONFIG_INFINIBAND_HFI1) += hfi1/
|
||||
obj-$(CONFIG_INFINIBAND_HNS) += hns/
|
||||
obj-$(CONFIG_INFINIBAND_QEDR) += qedr/
|
||||
obj-$(CONFIG_INFINIBAND_BNXT_RE) += bnxt_re/
|
||||
obj-$(CONFIG_INFINIBAND_ERDMA) += erdma/
|
||||
|
||||
@@ -173,7 +173,7 @@ struct bnxt_re_dev {
|
||||
/* Max of 2 lossless traffic class supported per port */
|
||||
u16 cosq[2];
|
||||
|
||||
/* QP for for handling QP1 packets */
|
||||
/* QP for handling QP1 packets */
|
||||
struct bnxt_re_gsi_context gsi_ctx;
|
||||
struct bnxt_re_stats stats;
|
||||
atomic_t nq_alloc_cnt;
|
||||
|
||||
12
drivers/infiniband/hw/erdma/Kconfig
Normal file
12
drivers/infiniband/hw/erdma/Kconfig
Normal file
@@ -0,0 +1,12 @@
|
||||
# SPDX-License-Identifier: GPL-2.0-only
|
||||
config INFINIBAND_ERDMA
|
||||
tristate "Alibaba Elastic RDMA Adapter (ERDMA) support"
|
||||
depends on PCI_MSI && 64BIT
|
||||
depends on INFINIBAND_ADDR_TRANS
|
||||
depends on INFINIBAND_USER_ACCESS
|
||||
help
|
||||
This is an RDMA/iWARP driver for the Alibaba Elastic RDMA Adapter (ERDMA),
|
||||
which supports RDMA features in Alibaba cloud environment.
|
||||
|
||||
To compile this driver as a module, choose M here. The module will be
|
||||
called erdma.
|
||||
4
drivers/infiniband/hw/erdma/Makefile
Normal file
4
drivers/infiniband/hw/erdma/Makefile
Normal file
@@ -0,0 +1,4 @@
|
||||
# SPDX-License-Identifier: GPL-2.0
|
||||
obj-$(CONFIG_INFINIBAND_ERDMA) := erdma.o
|
||||
|
||||
erdma-y := erdma_cm.o erdma_main.o erdma_cmdq.o erdma_cq.o erdma_verbs.o erdma_qp.o erdma_eq.o
|
||||
287
drivers/infiniband/hw/erdma/erdma.h
Normal file
287
drivers/infiniband/hw/erdma/erdma.h
Normal file
@@ -0,0 +1,287 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
|
||||
|
||||
/* Authors: Cheng Xu <chengyou@linux.alibaba.com> */
|
||||
/* Kai Shen <kaishen@linux.alibaba.com> */
|
||||
/* Copyright (c) 2020-2022, Alibaba Group. */
|
||||
|
||||
#ifndef __ERDMA_H__
|
||||
#define __ERDMA_H__
|
||||
|
||||
#include <linux/bitfield.h>
|
||||
#include <linux/netdevice.h>
|
||||
#include <linux/xarray.h>
|
||||
#include <rdma/ib_verbs.h>
|
||||
|
||||
#include "erdma_hw.h"
|
||||
|
||||
#define DRV_MODULE_NAME "erdma"
|
||||
#define ERDMA_NODE_DESC "Elastic RDMA(iWARP) stack"
|
||||
|
||||
struct erdma_eq {
|
||||
void *qbuf;
|
||||
dma_addr_t qbuf_dma_addr;
|
||||
|
||||
spinlock_t lock;
|
||||
|
||||
u32 depth;
|
||||
|
||||
u16 ci;
|
||||
u16 rsvd;
|
||||
|
||||
atomic64_t event_num;
|
||||
atomic64_t notify_num;
|
||||
|
||||
u64 __iomem *db_addr;
|
||||
u64 *db_record;
|
||||
};
|
||||
|
||||
struct erdma_cmdq_sq {
|
||||
void *qbuf;
|
||||
dma_addr_t qbuf_dma_addr;
|
||||
|
||||
spinlock_t lock;
|
||||
|
||||
u32 depth;
|
||||
u16 ci;
|
||||
u16 pi;
|
||||
|
||||
u16 wqebb_cnt;
|
||||
|
||||
u64 *db_record;
|
||||
};
|
||||
|
||||
struct erdma_cmdq_cq {
|
||||
void *qbuf;
|
||||
dma_addr_t qbuf_dma_addr;
|
||||
|
||||
spinlock_t lock;
|
||||
|
||||
u32 depth;
|
||||
u32 ci;
|
||||
u32 cmdsn;
|
||||
|
||||
u64 *db_record;
|
||||
|
||||
atomic64_t armed_num;
|
||||
};
|
||||
|
||||
enum {
|
||||
ERDMA_CMD_STATUS_INIT,
|
||||
ERDMA_CMD_STATUS_ISSUED,
|
||||
ERDMA_CMD_STATUS_FINISHED,
|
||||
ERDMA_CMD_STATUS_TIMEOUT
|
||||
};
|
||||
|
||||
struct erdma_comp_wait {
|
||||
struct completion wait_event;
|
||||
u32 cmd_status;
|
||||
u32 ctx_id;
|
||||
u16 sq_pi;
|
||||
u8 comp_status;
|
||||
u8 rsvd;
|
||||
u32 comp_data[4];
|
||||
};
|
||||
|
||||
enum {
|
||||
ERDMA_CMDQ_STATE_OK_BIT = 0,
|
||||
ERDMA_CMDQ_STATE_TIMEOUT_BIT = 1,
|
||||
ERDMA_CMDQ_STATE_CTX_ERR_BIT = 2,
|
||||
};
|
||||
|
||||
#define ERDMA_CMDQ_TIMEOUT_MS 15000
|
||||
#define ERDMA_REG_ACCESS_WAIT_MS 20
|
||||
#define ERDMA_WAIT_DEV_DONE_CNT 500
|
||||
|
||||
struct erdma_cmdq {
|
||||
unsigned long *comp_wait_bitmap;
|
||||
struct erdma_comp_wait *wait_pool;
|
||||
spinlock_t lock;
|
||||
|
||||
bool use_event;
|
||||
|
||||
struct erdma_cmdq_sq sq;
|
||||
struct erdma_cmdq_cq cq;
|
||||
struct erdma_eq eq;
|
||||
|
||||
unsigned long state;
|
||||
|
||||
struct semaphore credits;
|
||||
u16 max_outstandings;
|
||||
};
|
||||
|
||||
#define COMPROMISE_CC ERDMA_CC_CUBIC
|
||||
enum erdma_cc_alg {
|
||||
ERDMA_CC_NEWRENO = 0,
|
||||
ERDMA_CC_CUBIC,
|
||||
ERDMA_CC_HPCC_RTT,
|
||||
ERDMA_CC_HPCC_ECN,
|
||||
ERDMA_CC_HPCC_INT,
|
||||
ERDMA_CC_METHODS_NUM
|
||||
};
|
||||
|
||||
struct erdma_devattr {
|
||||
u32 fw_version;
|
||||
|
||||
unsigned char peer_addr[ETH_ALEN];
|
||||
|
||||
int numa_node;
|
||||
enum erdma_cc_alg cc;
|
||||
u32 grp_num;
|
||||
u32 irq_num;
|
||||
|
||||
bool disable_dwqe;
|
||||
u16 dwqe_pages;
|
||||
u16 dwqe_entries;
|
||||
|
||||
u32 max_qp;
|
||||
u32 max_send_wr;
|
||||
u32 max_recv_wr;
|
||||
u32 max_ord;
|
||||
u32 max_ird;
|
||||
|
||||
u32 max_send_sge;
|
||||
u32 max_recv_sge;
|
||||
u32 max_sge_rd;
|
||||
u32 max_cq;
|
||||
u32 max_cqe;
|
||||
u64 max_mr_size;
|
||||
u32 max_mr;
|
||||
u32 max_pd;
|
||||
u32 max_mw;
|
||||
u32 local_dma_key;
|
||||
};
|
||||
|
||||
#define ERDMA_IRQNAME_SIZE 50
|
||||
|
||||
struct erdma_irq {
|
||||
char name[ERDMA_IRQNAME_SIZE];
|
||||
u32 msix_vector;
|
||||
cpumask_t affinity_hint_mask;
|
||||
};
|
||||
|
||||
struct erdma_eq_cb {
|
||||
bool ready;
|
||||
void *dev; /* All EQs use this fields to get erdma_dev struct */
|
||||
struct erdma_irq irq;
|
||||
struct erdma_eq eq;
|
||||
struct tasklet_struct tasklet;
|
||||
};
|
||||
|
||||
struct erdma_resource_cb {
|
||||
unsigned long *bitmap;
|
||||
spinlock_t lock;
|
||||
u32 next_alloc_idx;
|
||||
u32 max_cap;
|
||||
};
|
||||
|
||||
/* Indices into erdma_dev.res_cb[]; ERDMA_RES_CNT is the array size. */
enum {
	ERDMA_RES_TYPE_PD	= 0,
	ERDMA_RES_TYPE_STAG_IDX	= 1,
	ERDMA_RES_CNT		= 2,
};
|
||||
|
||||
#define ERDMA_EXTRA_BUFFER_SIZE ERDMA_DB_SIZE
|
||||
#define WARPPED_BUFSIZE(size) ((size) + ERDMA_EXTRA_BUFFER_SIZE)
|
||||
|
||||
struct erdma_dev {
|
||||
struct ib_device ibdev;
|
||||
struct net_device *netdev;
|
||||
struct pci_dev *pdev;
|
||||
struct notifier_block netdev_nb;
|
||||
|
||||
resource_size_t func_bar_addr;
|
||||
resource_size_t func_bar_len;
|
||||
u8 __iomem *func_bar;
|
||||
|
||||
struct erdma_devattr attrs;
|
||||
/* physical port state (only one port per device) */
|
||||
enum ib_port_state state;
|
||||
|
||||
/* cmdq and aeq use the same msix vector */
|
||||
struct erdma_irq comm_irq;
|
||||
struct erdma_cmdq cmdq;
|
||||
struct erdma_eq aeq;
|
||||
struct erdma_eq_cb ceqs[ERDMA_NUM_MSIX_VEC - 1];
|
||||
|
||||
spinlock_t lock;
|
||||
struct erdma_resource_cb res_cb[ERDMA_RES_CNT];
|
||||
struct xarray qp_xa;
|
||||
struct xarray cq_xa;
|
||||
|
||||
u32 next_alloc_qpn;
|
||||
u32 next_alloc_cqn;
|
||||
|
||||
spinlock_t db_bitmap_lock;
|
||||
/* We provide max 64 uContexts that each has one SQ doorbell Page. */
|
||||
DECLARE_BITMAP(sdb_page, ERDMA_DWQE_TYPE0_CNT);
|
||||
/*
|
||||
* We provide max 496 uContexts that each has one SQ normal Db,
|
||||
* and one directWQE db。
|
||||
*/
|
||||
DECLARE_BITMAP(sdb_entry, ERDMA_DWQE_TYPE1_CNT);
|
||||
|
||||
atomic_t num_ctx;
|
||||
struct list_head cep_list;
|
||||
};
|
||||
|
||||
/*
 * Return the address of entry @idx inside the ring that starts at @qbuf.
 *
 * @idx is masked with (@depth - 1) and the result is scaled by
 * (1 << @shift) bytes. The mask is only a true modulo when @depth is a
 * power of two.
 */
static inline void *get_queue_entry(void *qbuf, u32 idx, u32 depth, u32 shift)
{
	u32 slot = idx & (depth - 1);

	return qbuf + (slot << shift);
}
|
||||
|
||||
static inline struct erdma_dev *to_edev(struct ib_device *ibdev)
|
||||
{
|
||||
return container_of(ibdev, struct erdma_dev, ibdev);
|
||||
}
|
||||
|
||||
/* Read the 32-bit register at byte offset @reg of the function BAR. */
static inline u32 erdma_reg_read32(struct erdma_dev *dev, u32 reg)
{
	u8 __iomem *addr = dev->func_bar + reg;

	return readl(addr);
}
|
||||
|
||||
/* Read the 64-bit register at byte offset @reg of the function BAR. */
static inline u64 erdma_reg_read64(struct erdma_dev *dev, u32 reg)
{
	u8 __iomem *addr = dev->func_bar + reg;

	return readq(addr);
}
|
||||
|
||||
/* Write @value to the 32-bit register at byte offset @reg of the function BAR. */
static inline void erdma_reg_write32(struct erdma_dev *dev, u32 reg, u32 value)
{
	u8 __iomem *addr = dev->func_bar + reg;

	writel(value, addr);
}
|
||||
|
||||
/* Write @value to the 64-bit register at byte offset @reg of the function BAR. */
static inline void erdma_reg_write64(struct erdma_dev *dev, u32 reg, u64 value)
{
	u8 __iomem *addr = dev->func_bar + reg;

	writeq(value, addr);
}
|
||||
|
||||
/*
 * Read the 32-bit register at offset @reg and return only the field
 * selected by @filed_mask, shifted down via FIELD_GET().
 */
static inline u32 erdma_reg_read32_filed(struct erdma_dev *dev, u32 reg,
					 u32 filed_mask)
{
	u32 raw;

	raw = erdma_reg_read32(dev, reg);

	return FIELD_GET(filed_mask, raw);
}
|
||||
|
||||
int erdma_cmdq_init(struct erdma_dev *dev);
|
||||
void erdma_finish_cmdq_init(struct erdma_dev *dev);
|
||||
void erdma_cmdq_destroy(struct erdma_dev *dev);
|
||||
|
||||
void erdma_cmdq_build_reqhdr(u64 *hdr, u32 mod, u32 op);
|
||||
int erdma_post_cmd_wait(struct erdma_cmdq *cmdq, u64 *req, u32 req_size,
|
||||
u64 *resp0, u64 *resp1);
|
||||
void erdma_cmdq_completion_handler(struct erdma_cmdq *cmdq);
|
||||
|
||||
int erdma_ceqs_init(struct erdma_dev *dev);
|
||||
void erdma_ceqs_uninit(struct erdma_dev *dev);
|
||||
void notify_eq(struct erdma_eq *eq);
|
||||
void *get_next_valid_eqe(struct erdma_eq *eq);
|
||||
|
||||
int erdma_aeq_init(struct erdma_dev *dev);
|
||||
void erdma_aeq_destroy(struct erdma_dev *dev);
|
||||
|
||||
void erdma_aeq_event_handler(struct erdma_dev *dev);
|
||||
void erdma_ceq_completion_handler(struct erdma_eq_cb *ceq_cb);
|
||||
|
||||
#endif
|
||||
1430
drivers/infiniband/hw/erdma/erdma_cm.c
Normal file
1430
drivers/infiniband/hw/erdma/erdma_cm.c
Normal file
File diff suppressed because it is too large
Load Diff
167
drivers/infiniband/hw/erdma/erdma_cm.h
Normal file
167
drivers/infiniband/hw/erdma/erdma_cm.h
Normal file
@@ -0,0 +1,167 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
|
||||
|
||||
/* Authors: Cheng Xu <chengyou@linux.alibaba.com> */
|
||||
/* Kai Shen <kaishen@linux.alibaba.com> */
|
||||
/* Copyright (c) 2020-2022, Alibaba Group. */
|
||||
|
||||
/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
|
||||
/* Greg Joyce <greg@opengridcomputing.com> */
|
||||
/* Copyright (c) 2008-2019, IBM Corporation */
|
||||
/* Copyright (c) 2017, Open Grid Computing, Inc. */
|
||||
|
||||
#ifndef __ERDMA_CM_H__
|
||||
#define __ERDMA_CM_H__
|
||||
|
||||
#include <linux/tcp.h>
|
||||
#include <net/sock.h>
|
||||
#include <rdma/iw_cm.h>
|
||||
|
||||
/* iWarp MPA protocol defs */
|
||||
#define MPA_REVISION_EXT_1 129
|
||||
#define MPA_MAX_PRIVDATA RDMA_MAX_PRIVATE_DATA
|
||||
#define MPA_KEY_REQ "MPA ID Req Frame"
|
||||
#define MPA_KEY_REP "MPA ID Rep Frame"
|
||||
#define MPA_KEY_SIZE 16
|
||||
#define MPA_DEFAULT_HDR_LEN 28
|
||||
|
||||
struct mpa_rr_params {
|
||||
__be16 bits;
|
||||
__be16 pd_len;
|
||||
};
|
||||
|
||||
/*
|
||||
* MPA request/response Hdr bits & fields
|
||||
*/
|
||||
enum {
|
||||
MPA_RR_FLAG_MARKERS = __cpu_to_be16(0x8000),
|
||||
MPA_RR_FLAG_CRC = __cpu_to_be16(0x4000),
|
||||
MPA_RR_FLAG_REJECT = __cpu_to_be16(0x2000),
|
||||
MPA_RR_RESERVED = __cpu_to_be16(0x1f00),
|
||||
MPA_RR_MASK_REVISION = __cpu_to_be16(0x00ff)
|
||||
};
|
||||
|
||||
/*
|
||||
* MPA request/reply header
|
||||
*/
|
||||
struct mpa_rr {
|
||||
u8 key[16];
|
||||
struct mpa_rr_params params;
|
||||
};
|
||||
|
||||
struct erdma_mpa_ext {
|
||||
__be32 cookie;
|
||||
__be32 bits;
|
||||
};
|
||||
|
||||
enum {
|
||||
MPA_EXT_FLAG_CC = cpu_to_be32(0x0000000f),
|
||||
};
|
||||
|
||||
struct erdma_mpa_info {
|
||||
struct mpa_rr hdr; /* peer mpa hdr in host byte order */
|
||||
struct erdma_mpa_ext ext_data;
|
||||
char *pdata;
|
||||
int bytes_rcvd;
|
||||
};
|
||||
|
||||
/*
 * Set of socket callback pointers.
 *
 * NOTE(review): sk_data_ready takes an extra "bytes" argument here, while
 * the saved upcall of the same name in struct erdma_cep takes only the
 * socket - confirm whether this struct is still used and which signature
 * is intended.
 */
struct erdma_sk_upcalls {
	void (*sk_state_change)(struct sock *sk);
	void (*sk_data_ready)(struct sock *sk, int bytes);
	void (*sk_error_report)(struct sock *sk);
};
|
||||
|
||||
struct erdma_dev;
|
||||
|
||||
/* Connection endpoint states; numbering starts at 1. */
enum erdma_cep_state {
	ERDMA_EPSTATE_IDLE		= 1,
	ERDMA_EPSTATE_LISTENING		= 2,
	ERDMA_EPSTATE_CONNECTING	= 3,
	ERDMA_EPSTATE_AWAIT_MPAREQ	= 4,
	ERDMA_EPSTATE_RECVD_MPAREQ	= 5,
	ERDMA_EPSTATE_AWAIT_MPAREP	= 6,
	ERDMA_EPSTATE_RDMA_MODE		= 7,
	ERDMA_EPSTATE_CLOSED		= 8,
};
|
||||
|
||||
struct erdma_cep {
|
||||
struct iw_cm_id *cm_id;
|
||||
struct erdma_dev *dev;
|
||||
struct list_head devq;
|
||||
spinlock_t lock;
|
||||
struct kref ref;
|
||||
int in_use;
|
||||
wait_queue_head_t waitq;
|
||||
enum erdma_cep_state state;
|
||||
|
||||
struct list_head listenq;
|
||||
struct erdma_cep *listen_cep;
|
||||
|
||||
struct erdma_qp *qp;
|
||||
struct socket *sock;
|
||||
|
||||
struct erdma_cm_work *mpa_timer;
|
||||
struct list_head work_freelist;
|
||||
|
||||
struct erdma_mpa_info mpa;
|
||||
int ord;
|
||||
int ird;
|
||||
|
||||
int pd_len;
|
||||
/* hold user's private data. */
|
||||
void *private_data;
|
||||
|
||||
/* Saved upcalls of socket llp.sock */
|
||||
void (*sk_state_change)(struct sock *sk);
|
||||
void (*sk_data_ready)(struct sock *sk);
|
||||
void (*sk_error_report)(struct sock *sk);
|
||||
};
|
||||
|
||||
#define MPAREQ_TIMEOUT (HZ * 20)
|
||||
#define MPAREP_TIMEOUT (HZ * 10)
|
||||
#define CONNECT_TIMEOUT (HZ * 10)
|
||||
|
||||
/* Work item kinds queued through erdma_cm_queue_work(); numbering starts at 1. */
enum erdma_work_type {
	ERDMA_CM_WORK_ACCEPT		= 1,
	ERDMA_CM_WORK_READ_MPAHDR	= 2,
	ERDMA_CM_WORK_CLOSE_LLP		= 3,	/* close socket */
	ERDMA_CM_WORK_PEER_CLOSE	= 4,	/* socket indicated peer close */
	ERDMA_CM_WORK_MPATIMEOUT	= 5,
	ERDMA_CM_WORK_CONNECTED		= 6,
	ERDMA_CM_WORK_CONNECTTIMEOUT	= 7,
};
|
||||
|
||||
struct erdma_cm_work {
|
||||
struct delayed_work work;
|
||||
struct list_head list;
|
||||
enum erdma_work_type type;
|
||||
struct erdma_cep *cep;
|
||||
};
|
||||
|
||||
#define to_sockaddr_in(a) (*(struct sockaddr_in *)(&(a)))
|
||||
|
||||
static inline int getname_peer(struct socket *s, struct sockaddr_storage *a)
|
||||
{
|
||||
return s->ops->getname(s, (struct sockaddr *)a, 1);
|
||||
}
|
||||
|
||||
static inline int getname_local(struct socket *s, struct sockaddr_storage *a)
|
||||
{
|
||||
return s->ops->getname(s, (struct sockaddr *)a, 0);
|
||||
}
|
||||
|
||||
int erdma_connect(struct iw_cm_id *id, struct iw_cm_conn_param *param);
|
||||
int erdma_accept(struct iw_cm_id *id, struct iw_cm_conn_param *param);
|
||||
int erdma_reject(struct iw_cm_id *id, const void *pdata, u8 plen);
|
||||
int erdma_create_listen(struct iw_cm_id *id, int backlog);
|
||||
int erdma_destroy_listen(struct iw_cm_id *id);
|
||||
|
||||
void erdma_cep_get(struct erdma_cep *ceq);
|
||||
void erdma_cep_put(struct erdma_cep *ceq);
|
||||
int erdma_cm_queue_work(struct erdma_cep *ceq, enum erdma_work_type type);
|
||||
|
||||
int erdma_cm_init(void);
|
||||
void erdma_cm_exit(void);
|
||||
|
||||
#define sk_to_cep(sk) ((struct erdma_cep *)((sk)->sk_user_data))
|
||||
|
||||
#endif
|
||||
493
drivers/infiniband/hw/erdma/erdma_cmdq.c
Normal file
493
drivers/infiniband/hw/erdma/erdma_cmdq.c
Normal file
@@ -0,0 +1,493 @@
|
||||
// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
|
||||
|
||||
/* Authors: Cheng Xu <chengyou@linux.alibaba.com> */
|
||||
/* Kai Shen <kaishen@linux.alibaba.com> */
|
||||
/* Copyright (c) 2020-2022, Alibaba Group. */
|
||||
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/pci.h>
|
||||
#include <linux/types.h>
|
||||
|
||||
#include "erdma.h"
|
||||
#include "erdma_hw.h"
|
||||
#include "erdma_verbs.h"
|
||||
|
||||
static void arm_cmdq_cq(struct erdma_cmdq *cmdq)
|
||||
{
|
||||
struct erdma_dev *dev = container_of(cmdq, struct erdma_dev, cmdq);
|
||||
u64 db_data = FIELD_PREP(ERDMA_CQDB_CI_MASK, cmdq->cq.ci) |
|
||||
FIELD_PREP(ERDMA_CQDB_ARM_MASK, 1) |
|
||||
FIELD_PREP(ERDMA_CQDB_CMDSN_MASK, cmdq->cq.cmdsn) |
|
||||
FIELD_PREP(ERDMA_CQDB_IDX_MASK, cmdq->cq.cmdsn);
|
||||
|
||||
*cmdq->cq.db_record = db_data;
|
||||
writeq(db_data, dev->func_bar + ERDMA_CMDQ_CQDB_REG);
|
||||
|
||||
atomic64_inc(&cmdq->cq.armed_num);
|
||||
}
|
||||
|
||||
static void kick_cmdq_db(struct erdma_cmdq *cmdq)
|
||||
{
|
||||
struct erdma_dev *dev = container_of(cmdq, struct erdma_dev, cmdq);
|
||||
u64 db_data = FIELD_PREP(ERDMA_CMD_HDR_WQEBB_INDEX_MASK, cmdq->sq.pi);
|
||||
|
||||
*cmdq->sq.db_record = db_data;
|
||||
writeq(db_data, dev->func_bar + ERDMA_CMDQ_SQDB_REG);
|
||||
}
|
||||
|
||||
static struct erdma_comp_wait *get_comp_wait(struct erdma_cmdq *cmdq)
|
||||
{
|
||||
int comp_idx;
|
||||
|
||||
spin_lock(&cmdq->lock);
|
||||
comp_idx = find_first_zero_bit(cmdq->comp_wait_bitmap,
|
||||
cmdq->max_outstandings);
|
||||
if (comp_idx == cmdq->max_outstandings) {
|
||||
spin_unlock(&cmdq->lock);
|
||||
return ERR_PTR(-ENOMEM);
|
||||
}
|
||||
|
||||
__set_bit(comp_idx, cmdq->comp_wait_bitmap);
|
||||
spin_unlock(&cmdq->lock);
|
||||
|
||||
return &cmdq->wait_pool[comp_idx];
|
||||
}
|
||||
|
||||
static void put_comp_wait(struct erdma_cmdq *cmdq,
|
||||
struct erdma_comp_wait *comp_wait)
|
||||
{
|
||||
int used;
|
||||
|
||||
cmdq->wait_pool[comp_wait->ctx_id].cmd_status = ERDMA_CMD_STATUS_INIT;
|
||||
spin_lock(&cmdq->lock);
|
||||
used = __test_and_clear_bit(comp_wait->ctx_id, cmdq->comp_wait_bitmap);
|
||||
spin_unlock(&cmdq->lock);
|
||||
|
||||
WARN_ON(!used);
|
||||
}
|
||||
|
||||
static int erdma_cmdq_wait_res_init(struct erdma_dev *dev,
|
||||
struct erdma_cmdq *cmdq)
|
||||
{
|
||||
int i;
|
||||
|
||||
cmdq->wait_pool =
|
||||
devm_kcalloc(&dev->pdev->dev, cmdq->max_outstandings,
|
||||
sizeof(struct erdma_comp_wait), GFP_KERNEL);
|
||||
if (!cmdq->wait_pool)
|
||||
return -ENOMEM;
|
||||
|
||||
spin_lock_init(&cmdq->lock);
|
||||
cmdq->comp_wait_bitmap = devm_bitmap_zalloc(
|
||||
&dev->pdev->dev, cmdq->max_outstandings, GFP_KERNEL);
|
||||
if (!cmdq->comp_wait_bitmap)
|
||||
return -ENOMEM;
|
||||
|
||||
for (i = 0; i < cmdq->max_outstandings; i++) {
|
||||
init_completion(&cmdq->wait_pool[i].wait_event);
|
||||
cmdq->wait_pool[i].ctx_id = i;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int erdma_cmdq_sq_init(struct erdma_dev *dev)
|
||||
{
|
||||
struct erdma_cmdq *cmdq = &dev->cmdq;
|
||||
struct erdma_cmdq_sq *sq = &cmdq->sq;
|
||||
u32 buf_size;
|
||||
|
||||
sq->wqebb_cnt = SQEBB_COUNT(ERDMA_CMDQ_SQE_SIZE);
|
||||
sq->depth = cmdq->max_outstandings * sq->wqebb_cnt;
|
||||
|
||||
buf_size = sq->depth << SQEBB_SHIFT;
|
||||
|
||||
sq->qbuf =
|
||||
dma_alloc_coherent(&dev->pdev->dev, WARPPED_BUFSIZE(buf_size),
|
||||
&sq->qbuf_dma_addr, GFP_KERNEL);
|
||||
if (!sq->qbuf)
|
||||
return -ENOMEM;
|
||||
|
||||
sq->db_record = (u64 *)(sq->qbuf + buf_size);
|
||||
|
||||
spin_lock_init(&sq->lock);
|
||||
|
||||
erdma_reg_write32(dev, ERDMA_REGS_CMDQ_SQ_ADDR_H_REG,
|
||||
upper_32_bits(sq->qbuf_dma_addr));
|
||||
erdma_reg_write32(dev, ERDMA_REGS_CMDQ_SQ_ADDR_L_REG,
|
||||
lower_32_bits(sq->qbuf_dma_addr));
|
||||
erdma_reg_write32(dev, ERDMA_REGS_CMDQ_DEPTH_REG, sq->depth);
|
||||
erdma_reg_write64(dev, ERDMA_CMDQ_SQ_DB_HOST_ADDR_REG,
|
||||
sq->qbuf_dma_addr + buf_size);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int erdma_cmdq_cq_init(struct erdma_dev *dev)
|
||||
{
|
||||
struct erdma_cmdq *cmdq = &dev->cmdq;
|
||||
struct erdma_cmdq_cq *cq = &cmdq->cq;
|
||||
u32 buf_size;
|
||||
|
||||
cq->depth = cmdq->sq.depth;
|
||||
buf_size = cq->depth << CQE_SHIFT;
|
||||
|
||||
cq->qbuf =
|
||||
dma_alloc_coherent(&dev->pdev->dev, WARPPED_BUFSIZE(buf_size),
|
||||
&cq->qbuf_dma_addr, GFP_KERNEL | __GFP_ZERO);
|
||||
if (!cq->qbuf)
|
||||
return -ENOMEM;
|
||||
|
||||
spin_lock_init(&cq->lock);
|
||||
|
||||
cq->db_record = (u64 *)(cq->qbuf + buf_size);
|
||||
|
||||
atomic64_set(&cq->armed_num, 0);
|
||||
|
||||
erdma_reg_write32(dev, ERDMA_REGS_CMDQ_CQ_ADDR_H_REG,
|
||||
upper_32_bits(cq->qbuf_dma_addr));
|
||||
erdma_reg_write32(dev, ERDMA_REGS_CMDQ_CQ_ADDR_L_REG,
|
||||
lower_32_bits(cq->qbuf_dma_addr));
|
||||
erdma_reg_write64(dev, ERDMA_CMDQ_CQ_DB_HOST_ADDR_REG,
|
||||
cq->qbuf_dma_addr + buf_size);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int erdma_cmdq_eq_init(struct erdma_dev *dev)
|
||||
{
|
||||
struct erdma_cmdq *cmdq = &dev->cmdq;
|
||||
struct erdma_eq *eq = &cmdq->eq;
|
||||
u32 buf_size;
|
||||
|
||||
eq->depth = cmdq->max_outstandings;
|
||||
buf_size = eq->depth << EQE_SHIFT;
|
||||
|
||||
eq->qbuf =
|
||||
dma_alloc_coherent(&dev->pdev->dev, WARPPED_BUFSIZE(buf_size),
|
||||
&eq->qbuf_dma_addr, GFP_KERNEL | __GFP_ZERO);
|
||||
if (!eq->qbuf)
|
||||
return -ENOMEM;
|
||||
|
||||
spin_lock_init(&eq->lock);
|
||||
atomic64_set(&eq->event_num, 0);
|
||||
|
||||
eq->db_addr =
|
||||
(u64 __iomem *)(dev->func_bar + ERDMA_REGS_CEQ_DB_BASE_REG);
|
||||
eq->db_record = (u64 *)(eq->qbuf + buf_size);
|
||||
|
||||
erdma_reg_write32(dev, ERDMA_REGS_CMDQ_EQ_ADDR_H_REG,
|
||||
upper_32_bits(eq->qbuf_dma_addr));
|
||||
erdma_reg_write32(dev, ERDMA_REGS_CMDQ_EQ_ADDR_L_REG,
|
||||
lower_32_bits(eq->qbuf_dma_addr));
|
||||
erdma_reg_write32(dev, ERDMA_REGS_CMDQ_EQ_DEPTH_REG, eq->depth);
|
||||
erdma_reg_write64(dev, ERDMA_CMDQ_EQ_DB_HOST_ADDR_REG,
|
||||
eq->qbuf_dma_addr + buf_size);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int erdma_cmdq_init(struct erdma_dev *dev)
|
||||
{
|
||||
int err, i;
|
||||
struct erdma_cmdq *cmdq = &dev->cmdq;
|
||||
u32 sts, ctrl;
|
||||
|
||||
cmdq->max_outstandings = ERDMA_CMDQ_MAX_OUTSTANDING;
|
||||
cmdq->use_event = false;
|
||||
|
||||
sema_init(&cmdq->credits, cmdq->max_outstandings);
|
||||
|
||||
err = erdma_cmdq_wait_res_init(dev, cmdq);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
err = erdma_cmdq_sq_init(dev);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
err = erdma_cmdq_cq_init(dev);
|
||||
if (err)
|
||||
goto err_destroy_sq;
|
||||
|
||||
err = erdma_cmdq_eq_init(dev);
|
||||
if (err)
|
||||
goto err_destroy_cq;
|
||||
|
||||
ctrl = FIELD_PREP(ERDMA_REG_DEV_CTRL_INIT_MASK, 1);
|
||||
erdma_reg_write32(dev, ERDMA_REGS_DEV_CTRL_REG, ctrl);
|
||||
|
||||
for (i = 0; i < ERDMA_WAIT_DEV_DONE_CNT; i++) {
|
||||
sts = erdma_reg_read32_filed(dev, ERDMA_REGS_DEV_ST_REG,
|
||||
ERDMA_REG_DEV_ST_INIT_DONE_MASK);
|
||||
if (sts)
|
||||
break;
|
||||
|
||||
msleep(ERDMA_REG_ACCESS_WAIT_MS);
|
||||
}
|
||||
|
||||
if (i == ERDMA_WAIT_DEV_DONE_CNT) {
|
||||
dev_err(&dev->pdev->dev, "wait init done failed.\n");
|
||||
err = -ETIMEDOUT;
|
||||
goto err_destroy_eq;
|
||||
}
|
||||
|
||||
set_bit(ERDMA_CMDQ_STATE_OK_BIT, &cmdq->state);
|
||||
|
||||
return 0;
|
||||
|
||||
err_destroy_eq:
|
||||
dma_free_coherent(&dev->pdev->dev,
|
||||
(cmdq->eq.depth << EQE_SHIFT) +
|
||||
ERDMA_EXTRA_BUFFER_SIZE,
|
||||
cmdq->eq.qbuf, cmdq->eq.qbuf_dma_addr);
|
||||
|
||||
err_destroy_cq:
|
||||
dma_free_coherent(&dev->pdev->dev,
|
||||
(cmdq->cq.depth << CQE_SHIFT) +
|
||||
ERDMA_EXTRA_BUFFER_SIZE,
|
||||
cmdq->cq.qbuf, cmdq->cq.qbuf_dma_addr);
|
||||
|
||||
err_destroy_sq:
|
||||
dma_free_coherent(&dev->pdev->dev,
|
||||
(cmdq->sq.depth << SQEBB_SHIFT) +
|
||||
ERDMA_EXTRA_BUFFER_SIZE,
|
||||
cmdq->sq.qbuf, cmdq->sq.qbuf_dma_addr);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
void erdma_finish_cmdq_init(struct erdma_dev *dev)
|
||||
{
|
||||
/* after device init successfully, change cmdq to event mode. */
|
||||
dev->cmdq.use_event = true;
|
||||
arm_cmdq_cq(&dev->cmdq);
|
||||
}
|
||||
|
||||
void erdma_cmdq_destroy(struct erdma_dev *dev)
|
||||
{
|
||||
struct erdma_cmdq *cmdq = &dev->cmdq;
|
||||
|
||||
clear_bit(ERDMA_CMDQ_STATE_OK_BIT, &cmdq->state);
|
||||
|
||||
dma_free_coherent(&dev->pdev->dev,
|
||||
(cmdq->eq.depth << EQE_SHIFT) +
|
||||
ERDMA_EXTRA_BUFFER_SIZE,
|
||||
cmdq->eq.qbuf, cmdq->eq.qbuf_dma_addr);
|
||||
dma_free_coherent(&dev->pdev->dev,
|
||||
(cmdq->sq.depth << SQEBB_SHIFT) +
|
||||
ERDMA_EXTRA_BUFFER_SIZE,
|
||||
cmdq->sq.qbuf, cmdq->sq.qbuf_dma_addr);
|
||||
dma_free_coherent(&dev->pdev->dev,
|
||||
(cmdq->cq.depth << CQE_SHIFT) +
|
||||
ERDMA_EXTRA_BUFFER_SIZE,
|
||||
cmdq->cq.qbuf, cmdq->cq.qbuf_dma_addr);
|
||||
}
|
||||
|
||||
static void *get_next_valid_cmdq_cqe(struct erdma_cmdq *cmdq)
|
||||
{
|
||||
__be32 *cqe = get_queue_entry(cmdq->cq.qbuf, cmdq->cq.ci,
|
||||
cmdq->cq.depth, CQE_SHIFT);
|
||||
u32 owner = FIELD_GET(ERDMA_CQE_HDR_OWNER_MASK,
|
||||
__be32_to_cpu(READ_ONCE(*cqe)));
|
||||
|
||||
return owner ^ !!(cmdq->cq.ci & cmdq->cq.depth) ? cqe : NULL;
|
||||
}
|
||||
|
||||
/*
 * Copy one request into the SQ and ring the doorbell.
 *
 * Marks @comp_wait as ISSUED, re-arms its completion, copies @req_len
 * bytes of @req into the entry at the current producer index, advances
 * pi by wqebb_cnt and finally rewrites the first 64-bit word with the
 * header extended by the new pi, the context cookie and the WQEBB count.
 */
static void push_cmdq_sqe(struct erdma_cmdq *cmdq, u64 *req, size_t req_len,
			  struct erdma_comp_wait *comp_wait)
{
	struct erdma_cmdq_sq *sq = &cmdq->sq;
	u64 hdr = *req;
	__le64 *wqe;

	comp_wait->cmd_status = ERDMA_CMD_STATUS_ISSUED;
	reinit_completion(&comp_wait->wait_event);
	comp_wait->sq_pi = sq->pi;

	wqe = get_queue_entry(sq->qbuf, sq->pi, sq->depth, SQEBB_SHIFT);
	memcpy(wqe, req, req_len);

	sq->pi += sq->wqebb_cnt;

	/* The header carries the producer index *after* this command. */
	hdr |= FIELD_PREP(ERDMA_CMD_HDR_WQEBB_CNT_MASK, sq->wqebb_cnt - 1) |
	       FIELD_PREP(ERDMA_CMD_HDR_CONTEXT_COOKIE_MASK,
			  comp_wait->ctx_id) |
	       FIELD_PREP(ERDMA_CMD_HDR_WQEBB_INDEX_MASK, sq->pi);
	*wqe = cpu_to_le64(hdr);

	kick_cmdq_db(cmdq);
}
|
||||
|
||||
static int erdma_poll_single_cmd_completion(struct erdma_cmdq *cmdq)
|
||||
{
|
||||
struct erdma_comp_wait *comp_wait;
|
||||
u32 hdr0, sqe_idx;
|
||||
__be32 *cqe;
|
||||
u16 ctx_id;
|
||||
u64 *sqe;
|
||||
int i;
|
||||
|
||||
cqe = get_next_valid_cmdq_cqe(cmdq);
|
||||
if (!cqe)
|
||||
return -EAGAIN;
|
||||
|
||||
cmdq->cq.ci++;
|
||||
|
||||
dma_rmb();
|
||||
hdr0 = __be32_to_cpu(*cqe);
|
||||
sqe_idx = __be32_to_cpu(*(cqe + 1));
|
||||
|
||||
sqe = get_queue_entry(cmdq->sq.qbuf, sqe_idx, cmdq->sq.depth,
|
||||
SQEBB_SHIFT);
|
||||
ctx_id = FIELD_GET(ERDMA_CMD_HDR_CONTEXT_COOKIE_MASK, *sqe);
|
||||
comp_wait = &cmdq->wait_pool[ctx_id];
|
||||
if (comp_wait->cmd_status != ERDMA_CMD_STATUS_ISSUED)
|
||||
return -EIO;
|
||||
|
||||
comp_wait->cmd_status = ERDMA_CMD_STATUS_FINISHED;
|
||||
comp_wait->comp_status = FIELD_GET(ERDMA_CQE_HDR_SYNDROME_MASK, hdr0);
|
||||
cmdq->sq.ci += cmdq->sq.wqebb_cnt;
|
||||
|
||||
for (i = 0; i < 4; i++)
|
||||
comp_wait->comp_data[i] = __be32_to_cpu(*(cqe + 2 + i));
|
||||
|
||||
if (cmdq->use_event)
|
||||
complete(&comp_wait->wait_event);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void erdma_polling_cmd_completions(struct erdma_cmdq *cmdq)
|
||||
{
|
||||
unsigned long flags;
|
||||
u16 comp_num;
|
||||
|
||||
spin_lock_irqsave(&cmdq->cq.lock, flags);
|
||||
|
||||
/* We must have less than # of max_outstandings
|
||||
* completions at one time.
|
||||
*/
|
||||
for (comp_num = 0; comp_num < cmdq->max_outstandings; comp_num++)
|
||||
if (erdma_poll_single_cmd_completion(cmdq))
|
||||
break;
|
||||
|
||||
if (comp_num && cmdq->use_event)
|
||||
arm_cmdq_cq(cmdq);
|
||||
|
||||
spin_unlock_irqrestore(&cmdq->cq.lock, flags);
|
||||
}
|
||||
|
||||
void erdma_cmdq_completion_handler(struct erdma_cmdq *cmdq)
|
||||
{
|
||||
int got_event = 0;
|
||||
|
||||
if (!test_bit(ERDMA_CMDQ_STATE_OK_BIT, &cmdq->state) ||
|
||||
!cmdq->use_event)
|
||||
return;
|
||||
|
||||
while (get_next_valid_eqe(&cmdq->eq)) {
|
||||
cmdq->eq.ci++;
|
||||
got_event++;
|
||||
}
|
||||
|
||||
if (got_event) {
|
||||
cmdq->cq.cmdsn++;
|
||||
erdma_polling_cmd_completions(cmdq);
|
||||
}
|
||||
|
||||
notify_eq(&cmdq->eq);
|
||||
}
|
||||
|
||||
/*
 * Polling-mode wait: repeatedly poll the command CQ until @comp_ctx leaves
 * the ISSUED state, sleeping 20 ms between attempts.
 *
 * Returns 0 once the status changed, or -ETIME when @timeout milliseconds
 * have elapsed first.
 */
static int erdma_poll_cmd_completion(struct erdma_comp_wait *comp_ctx,
				     struct erdma_cmdq *cmdq, u32 timeout)
{
	unsigned long deadline = jiffies + msecs_to_jiffies(timeout);

	for (;;) {
		erdma_polling_cmd_completions(cmdq);
		if (comp_ctx->cmd_status != ERDMA_CMD_STATUS_ISSUED)
			return 0;

		if (time_is_before_jiffies(deadline))
			return -ETIME;

		msleep(20);
	}
}
|
||||
|
||||
/*
 * Event-mode wait: sleep on the context's completion for at most @timeout
 * milliseconds.
 *
 * Returns 0 when the command reached FINISHED state, otherwise marks the
 * context as TIMEOUT and returns -ETIME.
 *
 * The completion path updates cmd_status while holding cq.lock, so the
 * decision to declare a timeout is taken under the same lock. This way a
 * completion that arrives just after the unlocked check is honoured
 * rather than being overwritten with TIMEOUT.
 */
static int erdma_wait_cmd_completion(struct erdma_comp_wait *comp_ctx,
				     struct erdma_cmdq *cmdq, u32 timeout)
{
	unsigned long flags = 0;
	int ret = 0;

	wait_for_completion_timeout(&comp_ctx->wait_event,
				    msecs_to_jiffies(timeout));

	if (likely(comp_ctx->cmd_status == ERDMA_CMD_STATUS_FINISHED))
		return 0;

	spin_lock_irqsave(&cmdq->cq.lock, flags);
	/* Re-check: the poller may have finished the command meanwhile. */
	if (comp_ctx->cmd_status != ERDMA_CMD_STATUS_FINISHED) {
		comp_ctx->cmd_status = ERDMA_CMD_STATUS_TIMEOUT;
		ret = -ETIME;
	}
	spin_unlock_irqrestore(&cmdq->cq.lock, flags);

	return ret;
}
|
||||
|
||||
/* Store a request header made of the sub-module @mod and opcode @op in @hdr. */
void erdma_cmdq_build_reqhdr(u64 *hdr, u32 mod, u32 op)
{
	u64 sub_mod_bits = FIELD_PREP(ERDMA_CMD_HDR_SUB_MOD_MASK, mod);
	u64 opcode_bits = FIELD_PREP(ERDMA_CMD_HDR_OPCODE_MASK, op);

	*hdr = sub_mod_bits | opcode_bits;
}
|
||||
|
||||
/*
 * Post one command to the command queue and wait for its completion.
 *
 * @cmdq:     command queue to use
 * @req:      request; its first 64-bit word is the header
 * @req_size: number of bytes of @req to copy into the SQ
 * @resp0:    receives response words 0-1 (only written when both @resp0
 *            and @resp1 are non-NULL)
 * @resp1:    receives response words 2-3 (same condition)
 *
 * Returns 0 on success, -ENODEV when the cmdq is not in OK state, -ENOMEM
 * when no wait context is free (the cmdq is then marked CTX_ERR), -ETIME
 * when the command timed out (the cmdq is then marked TIMEOUT and the
 * wait context is intentionally not released), or -EIO when the device
 * reported a non-zero completion status.
 */
int erdma_post_cmd_wait(struct erdma_cmdq *cmdq, u64 *req, u32 req_size,
			u64 *resp0, u64 *resp1)
{
	struct erdma_comp_wait *comp_wait;
	int ret;

	if (!test_bit(ERDMA_CMDQ_STATE_OK_BIT, &cmdq->state))
		return -ENODEV;

	down(&cmdq->credits);

	comp_wait = get_comp_wait(cmdq);
	if (IS_ERR(comp_wait)) {
		clear_bit(ERDMA_CMDQ_STATE_OK_BIT, &cmdq->state);
		set_bit(ERDMA_CMDQ_STATE_CTX_ERR_BIT, &cmdq->state);
		up(&cmdq->credits);
		return PTR_ERR(comp_wait);
	}

	spin_lock(&cmdq->sq.lock);
	push_cmdq_sqe(cmdq, req, req_size, comp_wait);
	spin_unlock(&cmdq->sq.lock);

	if (cmdq->use_event)
		ret = erdma_wait_cmd_completion(comp_wait, cmdq,
						ERDMA_CMDQ_TIMEOUT_MS);
	else
		ret = erdma_poll_cmd_completion(comp_wait, cmdq,
						ERDMA_CMDQ_TIMEOUT_MS);

	if (ret) {
		set_bit(ERDMA_CMDQ_STATE_TIMEOUT_BIT, &cmdq->state);
		clear_bit(ERDMA_CMDQ_STATE_OK_BIT, &cmdq->state);
		goto out;
	}

	if (comp_wait->comp_status)
		ret = -EIO;

	if (resp0 && resp1) {
		/*
		 * comp_data is a u32 array and therefore only guaranteed to
		 * be 4-byte aligned; copy the bytes instead of dereferencing
		 * it through a u64 pointer.
		 */
		memcpy(resp0, &comp_wait->comp_data[0], sizeof(*resp0));
		memcpy(resp1, &comp_wait->comp_data[2], sizeof(*resp1));
	}
	put_comp_wait(cmdq, comp_wait);

out:
	up(&cmdq->credits);

	return ret;
}
|
||||
205
drivers/infiniband/hw/erdma/erdma_cq.c
Normal file
205
drivers/infiniband/hw/erdma/erdma_cq.c
Normal file
@@ -0,0 +1,205 @@
|
||||
// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
|
||||
|
||||
/* Authors: Cheng Xu <chengyou@linux.alibaba.com> */
|
||||
/* Kai Shen <kaishen@linux.alibaba.com> */
|
||||
/* Copyright (c) 2020-2022, Alibaba Group. */
|
||||
|
||||
#include <rdma/ib_verbs.h>
|
||||
|
||||
#include "erdma_hw.h"
|
||||
#include "erdma_verbs.h"
|
||||
|
||||
static void *get_next_valid_cqe(struct erdma_cq *cq)
|
||||
{
|
||||
__be32 *cqe = get_queue_entry(cq->kern_cq.qbuf, cq->kern_cq.ci,
|
||||
cq->depth, CQE_SHIFT);
|
||||
u32 owner = FIELD_GET(ERDMA_CQE_HDR_OWNER_MASK,
|
||||
__be32_to_cpu(READ_ONCE(*cqe)));
|
||||
|
||||
return owner ^ !!(cq->kern_cq.ci & cq->depth) ? cqe : NULL;
|
||||
}
|
||||
|
||||
/*
 * Arm the CQ: build the doorbell value from notify_cnt, CQN, ARM = 1, the
 * solicited flag, cmdsn and ci, store it in the doorbell record and write
 * it to the CQ doorbell.
 */
static void notify_cq(struct erdma_cq *cq, u8 solcitied)
{
	u64 db_data;

	db_data = FIELD_PREP(ERDMA_CQDB_CI_MASK, cq->kern_cq.ci) |
		  FIELD_PREP(ERDMA_CQDB_CMDSN_MASK, cq->kern_cq.cmdsn) |
		  FIELD_PREP(ERDMA_CQDB_SOL_MASK, solcitied) |
		  FIELD_PREP(ERDMA_CQDB_ARM_MASK, 1) |
		  FIELD_PREP(ERDMA_CQDB_CQN_MASK, cq->cqn) |
		  FIELD_PREP(ERDMA_CQDB_IDX_MASK, (cq->kern_cq.notify_cnt));

	*cq->kern_cq.db_record = db_data;
	writeq(db_data, cq->kern_cq.db);
}
|
||||
|
||||
int erdma_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags)
|
||||
{
|
||||
struct erdma_cq *cq = to_ecq(ibcq);
|
||||
unsigned long irq_flags;
|
||||
int ret = 0;
|
||||
|
||||
spin_lock_irqsave(&cq->kern_cq.lock, irq_flags);
|
||||
|
||||
notify_cq(cq, (flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED);
|
||||
|
||||
if ((flags & IB_CQ_REPORT_MISSED_EVENTS) && get_next_valid_cqe(cq))
|
||||
ret = 1;
|
||||
|
||||
cq->kern_cq.notify_cnt++;
|
||||
|
||||
spin_unlock_irqrestore(&cq->kern_cq.lock, irq_flags);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static const enum ib_wc_opcode wc_mapping_table[ERDMA_NUM_OPCODES] = {
|
||||
[ERDMA_OP_WRITE] = IB_WC_RDMA_WRITE,
|
||||
[ERDMA_OP_READ] = IB_WC_RDMA_READ,
|
||||
[ERDMA_OP_SEND] = IB_WC_SEND,
|
||||
[ERDMA_OP_SEND_WITH_IMM] = IB_WC_SEND,
|
||||
[ERDMA_OP_RECEIVE] = IB_WC_RECV,
|
||||
[ERDMA_OP_RECV_IMM] = IB_WC_RECV_RDMA_WITH_IMM,
|
||||
[ERDMA_OP_RECV_INV] = IB_WC_RECV,
|
||||
[ERDMA_OP_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE,
|
||||
[ERDMA_OP_INVALIDATE] = IB_WC_LOCAL_INV,
|
||||
[ERDMA_OP_RSP_SEND_IMM] = IB_WC_RECV,
|
||||
[ERDMA_OP_SEND_WITH_INV] = IB_WC_SEND,
|
||||
[ERDMA_OP_REG_MR] = IB_WC_REG_MR,
|
||||
[ERDMA_OP_LOCAL_INV] = IB_WC_LOCAL_INV,
|
||||
[ERDMA_OP_READ_WITH_INV] = IB_WC_RDMA_READ,
|
||||
};
|
||||
|
||||
static const struct {
|
||||
enum erdma_wc_status erdma;
|
||||
enum ib_wc_status base;
|
||||
enum erdma_vendor_err vendor;
|
||||
} map_cqe_status[ERDMA_NUM_WC_STATUS] = {
|
||||
{ ERDMA_WC_SUCCESS, IB_WC_SUCCESS, ERDMA_WC_VENDOR_NO_ERR },
|
||||
{ ERDMA_WC_GENERAL_ERR, IB_WC_GENERAL_ERR, ERDMA_WC_VENDOR_NO_ERR },
|
||||
{ ERDMA_WC_RECV_WQE_FORMAT_ERR, IB_WC_GENERAL_ERR,
|
||||
ERDMA_WC_VENDOR_INVALID_RQE },
|
||||
{ ERDMA_WC_RECV_STAG_INVALID_ERR, IB_WC_REM_ACCESS_ERR,
|
||||
ERDMA_WC_VENDOR_RQE_INVALID_STAG },
|
||||
{ ERDMA_WC_RECV_ADDR_VIOLATION_ERR, IB_WC_REM_ACCESS_ERR,
|
||||
ERDMA_WC_VENDOR_RQE_ADDR_VIOLATION },
|
||||
{ ERDMA_WC_RECV_RIGHT_VIOLATION_ERR, IB_WC_REM_ACCESS_ERR,
|
||||
ERDMA_WC_VENDOR_RQE_ACCESS_RIGHT_ERR },
|
||||
{ ERDMA_WC_RECV_PDID_ERR, IB_WC_REM_ACCESS_ERR,
|
||||
ERDMA_WC_VENDOR_RQE_INVALID_PD },
|
||||
{ ERDMA_WC_RECV_WARRPING_ERR, IB_WC_REM_ACCESS_ERR,
|
||||
ERDMA_WC_VENDOR_RQE_WRAP_ERR },
|
||||
{ ERDMA_WC_SEND_WQE_FORMAT_ERR, IB_WC_LOC_QP_OP_ERR,
|
||||
ERDMA_WC_VENDOR_INVALID_SQE },
|
||||
{ ERDMA_WC_SEND_WQE_ORD_EXCEED, IB_WC_GENERAL_ERR,
|
||||
ERDMA_WC_VENDOR_ZERO_ORD },
|
||||
{ ERDMA_WC_SEND_STAG_INVALID_ERR, IB_WC_LOC_ACCESS_ERR,
|
||||
ERDMA_WC_VENDOR_SQE_INVALID_STAG },
|
||||
{ ERDMA_WC_SEND_ADDR_VIOLATION_ERR, IB_WC_LOC_ACCESS_ERR,
|
||||
ERDMA_WC_VENDOR_SQE_ADDR_VIOLATION },
|
||||
{ ERDMA_WC_SEND_RIGHT_VIOLATION_ERR, IB_WC_LOC_ACCESS_ERR,
|
||||
ERDMA_WC_VENDOR_SQE_ACCESS_ERR },
|
||||
{ ERDMA_WC_SEND_PDID_ERR, IB_WC_LOC_ACCESS_ERR,
|
||||
ERDMA_WC_VENDOR_SQE_INVALID_PD },
|
||||
{ ERDMA_WC_SEND_WARRPING_ERR, IB_WC_LOC_ACCESS_ERR,
|
||||
ERDMA_WC_VENDOR_SQE_WARP_ERR },
|
||||
{ ERDMA_WC_FLUSH_ERR, IB_WC_WR_FLUSH_ERR, ERDMA_WC_VENDOR_NO_ERR },
|
||||
{ ERDMA_WC_RETRY_EXC_ERR, IB_WC_RETRY_EXC_ERR, ERDMA_WC_VENDOR_NO_ERR },
|
||||
};
|
||||
|
||||
#define ERDMA_POLLCQ_NO_QP 1
|
||||
|
||||
static int erdma_poll_one_cqe(struct erdma_cq *cq, struct ib_wc *wc)
|
||||
{
|
||||
struct erdma_dev *dev = to_edev(cq->ibcq.device);
|
||||
u8 opcode, syndrome, qtype;
|
||||
struct erdma_kqp *kern_qp;
|
||||
struct erdma_cqe *cqe;
|
||||
struct erdma_qp *qp;
|
||||
u16 wqe_idx, depth;
|
||||
u32 qpn, cqe_hdr;
|
||||
u64 *id_table;
|
||||
u64 *wqe_hdr;
|
||||
|
||||
cqe = get_next_valid_cqe(cq);
|
||||
if (!cqe)
|
||||
return -EAGAIN;
|
||||
|
||||
cq->kern_cq.ci++;
|
||||
|
||||
/* cqbuf should be ready when we poll */
|
||||
dma_rmb();
|
||||
|
||||
qpn = be32_to_cpu(cqe->qpn);
|
||||
wqe_idx = be32_to_cpu(cqe->qe_idx);
|
||||
cqe_hdr = be32_to_cpu(cqe->hdr);
|
||||
|
||||
qp = find_qp_by_qpn(dev, qpn);
|
||||
if (!qp)
|
||||
return ERDMA_POLLCQ_NO_QP;
|
||||
|
||||
kern_qp = &qp->kern_qp;
|
||||
|
||||
qtype = FIELD_GET(ERDMA_CQE_HDR_QTYPE_MASK, cqe_hdr);
|
||||
syndrome = FIELD_GET(ERDMA_CQE_HDR_SYNDROME_MASK, cqe_hdr);
|
||||
opcode = FIELD_GET(ERDMA_CQE_HDR_OPCODE_MASK, cqe_hdr);
|
||||
|
||||
if (qtype == ERDMA_CQE_QTYPE_SQ) {
|
||||
id_table = kern_qp->swr_tbl;
|
||||
depth = qp->attrs.sq_size;
|
||||
wqe_hdr = get_queue_entry(qp->kern_qp.sq_buf, wqe_idx,
|
||||
qp->attrs.sq_size, SQEBB_SHIFT);
|
||||
kern_qp->sq_ci =
|
||||
FIELD_GET(ERDMA_SQE_HDR_WQEBB_CNT_MASK, *wqe_hdr) +
|
||||
wqe_idx + 1;
|
||||
} else {
|
||||
id_table = kern_qp->rwr_tbl;
|
||||
depth = qp->attrs.rq_size;
|
||||
}
|
||||
wc->wr_id = id_table[wqe_idx & (depth - 1)];
|
||||
wc->byte_len = be32_to_cpu(cqe->size);
|
||||
|
||||
wc->wc_flags = 0;
|
||||
|
||||
wc->opcode = wc_mapping_table[opcode];
|
||||
if (opcode == ERDMA_OP_RECV_IMM || opcode == ERDMA_OP_RSP_SEND_IMM) {
|
||||
wc->ex.imm_data = cpu_to_be32(le32_to_cpu(cqe->imm_data));
|
||||
wc->wc_flags |= IB_WC_WITH_IMM;
|
||||
} else if (opcode == ERDMA_OP_RECV_INV) {
|
||||
wc->ex.invalidate_rkey = be32_to_cpu(cqe->inv_rkey);
|
||||
wc->wc_flags |= IB_WC_WITH_INVALIDATE;
|
||||
}
|
||||
|
||||
if (syndrome >= ERDMA_NUM_WC_STATUS)
|
||||
syndrome = ERDMA_WC_GENERAL_ERR;
|
||||
|
||||
wc->status = map_cqe_status[syndrome].base;
|
||||
wc->vendor_err = map_cqe_status[syndrome].vendor;
|
||||
wc->qp = &qp->ibqp;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int erdma_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc)
|
||||
{
|
||||
struct erdma_cq *cq = to_ecq(ibcq);
|
||||
unsigned long flags;
|
||||
int npolled, ret;
|
||||
|
||||
spin_lock_irqsave(&cq->kern_cq.lock, flags);
|
||||
|
||||
for (npolled = 0; npolled < num_entries;) {
|
||||
ret = erdma_poll_one_cqe(cq, wc + npolled);
|
||||
|
||||
if (ret == -EAGAIN) /* no received new CQEs. */
|
||||
break;
|
||||
else if (ret) /* ignore invalid CQEs. */
|
||||
continue;
|
||||
|
||||
npolled++;
|
||||
}
|
||||
|
||||
spin_unlock_irqrestore(&cq->kern_cq.lock, flags);
|
||||
|
||||
return npolled;
|
||||
}
|
||||
329
drivers/infiniband/hw/erdma/erdma_eq.c
Normal file
329
drivers/infiniband/hw/erdma/erdma_eq.c
Normal file
@@ -0,0 +1,329 @@
|
||||
// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
|
||||
|
||||
/* Authors: Cheng Xu <chengyou@linux.alibaba.com> */
|
||||
/* Kai Shen <kaishen@linux.alibaba.com> */
|
||||
/* Copyright (c) 2020-2022, Alibaba Group. */
|
||||
|
||||
#include <linux/errno.h>
|
||||
#include <linux/pci.h>
|
||||
#include <linux/types.h>
|
||||
|
||||
#include "erdma.h"
|
||||
#include "erdma_hw.h"
|
||||
#include "erdma_verbs.h"
|
||||
|
||||
#define MAX_POLL_CHUNK_SIZE 16
|
||||
|
||||
void notify_eq(struct erdma_eq *eq)
|
||||
{
|
||||
u64 db_data = FIELD_PREP(ERDMA_EQDB_CI_MASK, eq->ci) |
|
||||
FIELD_PREP(ERDMA_EQDB_ARM_MASK, 1);
|
||||
|
||||
*eq->db_record = db_data;
|
||||
writeq(db_data, eq->db_addr);
|
||||
|
||||
atomic64_inc(&eq->notify_num);
|
||||
}
|
||||
|
||||
void *get_next_valid_eqe(struct erdma_eq *eq)
|
||||
{
|
||||
u64 *eqe = get_queue_entry(eq->qbuf, eq->ci, eq->depth, EQE_SHIFT);
|
||||
u32 owner = FIELD_GET(ERDMA_CEQE_HDR_O_MASK, READ_ONCE(*eqe));
|
||||
|
||||
return owner ^ !!(eq->ci & eq->depth) ? eqe : NULL;
|
||||
}
|
||||
|
||||
void erdma_aeq_event_handler(struct erdma_dev *dev)
|
||||
{
|
||||
struct erdma_aeqe *aeqe;
|
||||
u32 cqn, qpn;
|
||||
struct erdma_qp *qp;
|
||||
struct erdma_cq *cq;
|
||||
struct ib_event event;
|
||||
u32 poll_cnt = 0;
|
||||
|
||||
memset(&event, 0, sizeof(event));
|
||||
|
||||
while (poll_cnt < MAX_POLL_CHUNK_SIZE) {
|
||||
aeqe = get_next_valid_eqe(&dev->aeq);
|
||||
if (!aeqe)
|
||||
break;
|
||||
|
||||
dma_rmb();
|
||||
|
||||
dev->aeq.ci++;
|
||||
atomic64_inc(&dev->aeq.event_num);
|
||||
poll_cnt++;
|
||||
|
||||
if (FIELD_GET(ERDMA_AEQE_HDR_TYPE_MASK,
|
||||
le32_to_cpu(aeqe->hdr)) == ERDMA_AE_TYPE_CQ_ERR) {
|
||||
cqn = le32_to_cpu(aeqe->event_data0);
|
||||
cq = find_cq_by_cqn(dev, cqn);
|
||||
if (!cq)
|
||||
continue;
|
||||
|
||||
event.device = cq->ibcq.device;
|
||||
event.element.cq = &cq->ibcq;
|
||||
event.event = IB_EVENT_CQ_ERR;
|
||||
if (cq->ibcq.event_handler)
|
||||
cq->ibcq.event_handler(&event,
|
||||
cq->ibcq.cq_context);
|
||||
} else {
|
||||
qpn = le32_to_cpu(aeqe->event_data0);
|
||||
qp = find_qp_by_qpn(dev, qpn);
|
||||
if (!qp)
|
||||
continue;
|
||||
|
||||
event.device = qp->ibqp.device;
|
||||
event.element.qp = &qp->ibqp;
|
||||
event.event = IB_EVENT_QP_FATAL;
|
||||
if (qp->ibqp.event_handler)
|
||||
qp->ibqp.event_handler(&event,
|
||||
qp->ibqp.qp_context);
|
||||
}
|
||||
}
|
||||
|
||||
notify_eq(&dev->aeq);
|
||||
}
|
||||
|
||||
int erdma_aeq_init(struct erdma_dev *dev)
|
||||
{
|
||||
struct erdma_eq *eq = &dev->aeq;
|
||||
u32 buf_size;
|
||||
|
||||
eq->depth = ERDMA_DEFAULT_EQ_DEPTH;
|
||||
buf_size = eq->depth << EQE_SHIFT;
|
||||
|
||||
eq->qbuf =
|
||||
dma_alloc_coherent(&dev->pdev->dev, WARPPED_BUFSIZE(buf_size),
|
||||
&eq->qbuf_dma_addr, GFP_KERNEL | __GFP_ZERO);
|
||||
if (!eq->qbuf)
|
||||
return -ENOMEM;
|
||||
|
||||
spin_lock_init(&eq->lock);
|
||||
atomic64_set(&eq->event_num, 0);
|
||||
atomic64_set(&eq->notify_num, 0);
|
||||
|
||||
eq->db_addr = (u64 __iomem *)(dev->func_bar + ERDMA_REGS_AEQ_DB_REG);
|
||||
eq->db_record = (u64 *)(eq->qbuf + buf_size);
|
||||
|
||||
erdma_reg_write32(dev, ERDMA_REGS_AEQ_ADDR_H_REG,
|
||||
upper_32_bits(eq->qbuf_dma_addr));
|
||||
erdma_reg_write32(dev, ERDMA_REGS_AEQ_ADDR_L_REG,
|
||||
lower_32_bits(eq->qbuf_dma_addr));
|
||||
erdma_reg_write32(dev, ERDMA_REGS_AEQ_DEPTH_REG, eq->depth);
|
||||
erdma_reg_write64(dev, ERDMA_AEQ_DB_HOST_ADDR_REG,
|
||||
eq->qbuf_dma_addr + buf_size);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void erdma_aeq_destroy(struct erdma_dev *dev)
|
||||
{
|
||||
struct erdma_eq *eq = &dev->aeq;
|
||||
|
||||
dma_free_coherent(&dev->pdev->dev,
|
||||
WARPPED_BUFSIZE(eq->depth << EQE_SHIFT), eq->qbuf,
|
||||
eq->qbuf_dma_addr);
|
||||
}
|
||||
|
||||
void erdma_ceq_completion_handler(struct erdma_eq_cb *ceq_cb)
|
||||
{
|
||||
struct erdma_dev *dev = ceq_cb->dev;
|
||||
struct erdma_cq *cq;
|
||||
u32 poll_cnt = 0;
|
||||
u64 *ceqe;
|
||||
int cqn;
|
||||
|
||||
if (!ceq_cb->ready)
|
||||
return;
|
||||
|
||||
while (poll_cnt < MAX_POLL_CHUNK_SIZE) {
|
||||
ceqe = get_next_valid_eqe(&ceq_cb->eq);
|
||||
if (!ceqe)
|
||||
break;
|
||||
|
||||
dma_rmb();
|
||||
ceq_cb->eq.ci++;
|
||||
poll_cnt++;
|
||||
cqn = FIELD_GET(ERDMA_CEQE_HDR_CQN_MASK, READ_ONCE(*ceqe));
|
||||
|
||||
cq = find_cq_by_cqn(dev, cqn);
|
||||
if (!cq)
|
||||
continue;
|
||||
|
||||
if (rdma_is_kernel_res(&cq->ibcq.res))
|
||||
cq->kern_cq.cmdsn++;
|
||||
|
||||
if (cq->ibcq.comp_handler)
|
||||
cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context);
|
||||
}
|
||||
|
||||
notify_eq(&ceq_cb->eq);
|
||||
}
|
||||
|
||||
static irqreturn_t erdma_intr_ceq_handler(int irq, void *data)
|
||||
{
|
||||
struct erdma_eq_cb *ceq_cb = data;
|
||||
|
||||
tasklet_schedule(&ceq_cb->tasklet);
|
||||
|
||||
return IRQ_HANDLED;
|
||||
}
|
||||
|
||||
/* Tasklet body: @data is the struct erdma_eq_cb pointer of the CEQ. */
static void erdma_intr_ceq_task(unsigned long data)
{
	struct erdma_eq_cb *eqc = (struct erdma_eq_cb *)data;

	erdma_ceq_completion_handler(eqc);
}
|
||||
|
||||
/*
 * Set up the interrupt of CEQ @ceqn: name it, bind it to MSI-X vector
 * @ceqn + 1, initialise its tasklet, request the IRQ and publish an
 * affinity hint chosen via cpumask_local_spread() on the device's NUMA node.
 *
 * Returns 0 on success or the error returned by request_irq().
 */
static int erdma_set_ceq_irq(struct erdma_dev *dev, u16 ceqn)
{
	struct erdma_eq_cb *eqc = &dev->ceqs[ceqn];
	int ret;

	snprintf(eqc->irq.name, ERDMA_IRQNAME_SIZE, "erdma-ceq%u@pci:%s", ceqn,
		 pci_name(dev->pdev));
	eqc->irq.msix_vector = pci_irq_vector(dev->pdev, ceqn + 1);

	tasklet_init(&eqc->tasklet, erdma_intr_ceq_task, (unsigned long)eqc);

	cpumask_set_cpu(cpumask_local_spread(ceqn + 1, dev->attrs.numa_node),
			&eqc->irq.affinity_hint_mask);

	ret = request_irq(eqc->irq.msix_vector, erdma_intr_ceq_handler, 0,
			  eqc->irq.name, eqc);
	if (ret) {
		dev_err(&dev->pdev->dev, "failed to request_irq(%d)\n", ret);
		return ret;
	}

	irq_set_affinity_hint(eqc->irq.msix_vector,
			      &eqc->irq.affinity_hint_mask);

	return 0;
}
|
||||
|
||||
/* Drop the affinity hint of CEQ @ceqn's vector and free its IRQ. */
static void erdma_free_ceq_irq(struct erdma_dev *dev, u16 ceqn)
{
	struct erdma_eq_cb *eqc = &dev->ceqs[ceqn];
	unsigned int vector = eqc->irq.msix_vector;

	irq_set_affinity_hint(vector, NULL);
	free_irq(vector, eqc);
}
|
||||
|
||||
/*
 * Issue a CMDQ_OPCODE_CREATE_EQ command that registers @eq with the device
 * as completion event queue number @eqn.
 *
 * The doorbell record is expected directly behind the ring, at
 * qbuf_dma_addr + (depth << EQE_SHIFT). The depth is passed as log2.
 *
 * Returns the result of erdma_post_cmd_wait().
 */
static int create_eq_cmd(struct erdma_dev *dev, u32 eqn, struct erdma_eq *eq)
{
	struct erdma_cmdq_create_eq_req req;
	dma_addr_t db_record_dma;

	db_record_dma = eq->qbuf_dma_addr + (eq->depth << EQE_SHIFT);

	erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_COMMON,
				CMDQ_OPCODE_CREATE_EQ);
	req.qbuf_addr = eq->qbuf_dma_addr;
	/* Vector index is the same as EQN. */
	req.vector_idx = eqn;
	req.eqn = eqn;
	req.depth = ilog2(eq->depth);
	req.qtype = ERDMA_EQ_TYPE_CEQ;
	req.db_dma_addr_l = lower_32_bits(db_record_dma);
	req.db_dma_addr_h = upper_32_bits(db_record_dma);

	return erdma_post_cmd_wait(&dev->cmdq, (u64 *)&req, sizeof(req), NULL,
				   NULL);
}
|
||||
|
||||
/*
 * Allocate completion event queue @ceqn of @dev and register it with the
 * device.
 *
 * The ring has ERDMA_DEFAULT_EQ_DEPTH entries with the doorbell record
 * placed directly behind it in the same DMA allocation. On the device side
 * CEQs are numbered from 1 (0 is reserved for the CMDQ EQ), hence the
 * "ceqn + 1" below.
 *
 * Returns 0 on success, -ENOMEM if the ring cannot be allocated, or the
 * error of the create-EQ command. On any failure no resources are left
 * allocated and the CEQ is marked not ready.
 */
static int erdma_ceq_init_one(struct erdma_dev *dev, u16 ceqn)
{
	struct erdma_eq *eq = &dev->ceqs[ceqn].eq;
	u32 buf_size = ERDMA_DEFAULT_EQ_DEPTH << EQE_SHIFT;
	int ret;

	eq->qbuf =
		dma_alloc_coherent(&dev->pdev->dev, WARPPED_BUFSIZE(buf_size),
				   &eq->qbuf_dma_addr, GFP_KERNEL | __GFP_ZERO);
	if (!eq->qbuf)
		return -ENOMEM;

	spin_lock_init(&eq->lock);
	atomic64_set(&eq->event_num, 0);
	atomic64_set(&eq->notify_num, 0);

	eq->depth = ERDMA_DEFAULT_EQ_DEPTH;
	eq->db_addr =
		(u64 __iomem *)(dev->func_bar + ERDMA_REGS_CEQ_DB_BASE_REG +
				(ceqn + 1) * ERDMA_DB_SIZE);
	eq->db_record = (u64 *)(eq->qbuf + buf_size);
	eq->ci = 0;
	dev->ceqs[ceqn].dev = dev;

	/* CEQ indexed from 1, 0 rsvd for CMDQ-EQ. */
	ret = create_eq_cmd(dev, ceqn + 1, eq);
	if (ret) {
		/*
		 * The caller only unwinds CEQs that were fully set up, so
		 * the ring must be released here or it would be leaked.
		 */
		dma_free_coherent(&dev->pdev->dev, WARPPED_BUFSIZE(buf_size),
				  eq->qbuf, eq->qbuf_dma_addr);
		eq->qbuf = NULL;
		dev->ceqs[ceqn].ready = false;
		return ret;
	}

	dev->ceqs[ceqn].ready = true;

	return 0;
}
|
||||
|
||||
/*
 * Tear down completion event queue @ceqn of @dev.
 *
 * The CEQ is marked not ready first, then a CMDQ_OPCODE_DESTROY_EQ command
 * is issued (device-side CEQ numbers start at 1). The ring buffer is only
 * freed when the device acknowledged the destroy command; if the command
 * fails the buffer is intentionally kept.
 */
static void erdma_ceq_uninit_one(struct erdma_dev *dev, u16 ceqn)
{
	struct erdma_eq *eq = &dev->ceqs[ceqn].eq;
	u32 buf_size = ERDMA_DEFAULT_EQ_DEPTH << EQE_SHIFT;
	/* Zero-initialise so the reserved fields never carry stack garbage. */
	struct erdma_cmdq_destroy_eq_req req = {};
	int err;

	dev->ceqs[ceqn].ready = 0;

	erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_COMMON,
				CMDQ_OPCODE_DESTROY_EQ);
	/* CEQ indexed from 1, 0 rsvd for CMDQ-EQ. */
	req.eqn = ceqn + 1;
	req.qtype = ERDMA_EQ_TYPE_CEQ;
	req.vector_idx = ceqn + 1;

	err = erdma_post_cmd_wait(&dev->cmdq, (u64 *)&req, sizeof(req), NULL,
				  NULL);
	if (err)
		return;

	dma_free_coherent(&dev->pdev->dev, WARPPED_BUFSIZE(buf_size), eq->qbuf,
			  eq->qbuf_dma_addr);
}
|
||||
|
||||
int erdma_ceqs_init(struct erdma_dev *dev)
|
||||
{
|
||||
u32 i, j;
|
||||
int err;
|
||||
|
||||
for (i = 0; i < dev->attrs.irq_num - 1; i++) {
|
||||
err = erdma_ceq_init_one(dev, i);
|
||||
if (err)
|
||||
goto out_err;
|
||||
|
||||
err = erdma_set_ceq_irq(dev, i);
|
||||
if (err) {
|
||||
erdma_ceq_uninit_one(dev, i);
|
||||
goto out_err;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
||||
out_err:
|
||||
for (j = 0; j < i; j++) {
|
||||
erdma_free_ceq_irq(dev, j);
|
||||
erdma_ceq_uninit_one(dev, j);
|
||||
}
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
void erdma_ceqs_uninit(struct erdma_dev *dev)
|
||||
{
|
||||
u32 i;
|
||||
|
||||
for (i = 0; i < dev->attrs.irq_num - 1; i++) {
|
||||
erdma_free_ceq_irq(dev, i);
|
||||
erdma_ceq_uninit_one(dev, i);
|
||||
}
|
||||
}
|
||||
508
drivers/infiniband/hw/erdma/erdma_hw.h
Normal file
508
drivers/infiniband/hw/erdma/erdma_hw.h
Normal file
@@ -0,0 +1,508 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
|
||||
|
||||
/* Authors: Cheng Xu <chengyou@linux.alibaba.com> */
|
||||
/* Kai Shen <kaishen@linux.alibaba.com> */
|
||||
/* Copyright (c) 2020-2022, Alibaba Group. */
|
||||
|
||||
#ifndef __ERDMA_HW_H__
|
||||
#define __ERDMA_HW_H__
|
||||
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/types.h>
|
||||
|
||||
/* PCIe device related definition. */
|
||||
#define PCI_VENDOR_ID_ALIBABA 0x1ded
|
||||
|
||||
#define ERDMA_PCI_WIDTH 64
|
||||
#define ERDMA_FUNC_BAR 0
|
||||
#define ERDMA_MISX_BAR 2
|
||||
|
||||
#define ERDMA_BAR_MASK (BIT(ERDMA_FUNC_BAR) | BIT(ERDMA_MISX_BAR))
|
||||
|
||||
/* MSI-X related. */
|
||||
#define ERDMA_NUM_MSIX_VEC 32U
|
||||
#define ERDMA_MSIX_VECTOR_CMDQ 0
|
||||
|
||||
/* PCIe Bar0 Registers. */
|
||||
#define ERDMA_REGS_VERSION_REG 0x0
|
||||
#define ERDMA_REGS_DEV_CTRL_REG 0x10
|
||||
#define ERDMA_REGS_DEV_ST_REG 0x14
|
||||
#define ERDMA_REGS_NETDEV_MAC_L_REG 0x18
|
||||
#define ERDMA_REGS_NETDEV_MAC_H_REG 0x1C
|
||||
#define ERDMA_REGS_CMDQ_SQ_ADDR_L_REG 0x20
|
||||
#define ERDMA_REGS_CMDQ_SQ_ADDR_H_REG 0x24
|
||||
#define ERDMA_REGS_CMDQ_CQ_ADDR_L_REG 0x28
|
||||
#define ERDMA_REGS_CMDQ_CQ_ADDR_H_REG 0x2C
|
||||
#define ERDMA_REGS_CMDQ_DEPTH_REG 0x30
|
||||
#define ERDMA_REGS_CMDQ_EQ_DEPTH_REG 0x34
|
||||
#define ERDMA_REGS_CMDQ_EQ_ADDR_L_REG 0x38
|
||||
#define ERDMA_REGS_CMDQ_EQ_ADDR_H_REG 0x3C
|
||||
#define ERDMA_REGS_AEQ_ADDR_L_REG 0x40
|
||||
#define ERDMA_REGS_AEQ_ADDR_H_REG 0x44
|
||||
#define ERDMA_REGS_AEQ_DEPTH_REG 0x48
|
||||
#define ERDMA_REGS_GRP_NUM_REG 0x4c
|
||||
#define ERDMA_REGS_AEQ_DB_REG 0x50
|
||||
#define ERDMA_CMDQ_SQ_DB_HOST_ADDR_REG 0x60
|
||||
#define ERDMA_CMDQ_CQ_DB_HOST_ADDR_REG 0x68
|
||||
#define ERDMA_CMDQ_EQ_DB_HOST_ADDR_REG 0x70
|
||||
#define ERDMA_AEQ_DB_HOST_ADDR_REG 0x78
|
||||
#define ERDMA_REGS_STATS_TSO_IN_PKTS_REG 0x80
|
||||
#define ERDMA_REGS_STATS_TSO_OUT_PKTS_REG 0x88
|
||||
#define ERDMA_REGS_STATS_TSO_OUT_BYTES_REG 0x90
|
||||
#define ERDMA_REGS_STATS_TX_DROP_PKTS_REG 0x98
|
||||
#define ERDMA_REGS_STATS_TX_BPS_METER_DROP_PKTS_REG 0xa0
|
||||
#define ERDMA_REGS_STATS_TX_PPS_METER_DROP_PKTS_REG 0xa8
|
||||
#define ERDMA_REGS_STATS_RX_PKTS_REG 0xc0
|
||||
#define ERDMA_REGS_STATS_RX_BYTES_REG 0xc8
|
||||
#define ERDMA_REGS_STATS_RX_DROP_PKTS_REG 0xd0
|
||||
#define ERDMA_REGS_STATS_RX_BPS_METER_DROP_PKTS_REG 0xd8
|
||||
#define ERDMA_REGS_STATS_RX_PPS_METER_DROP_PKTS_REG 0xe0
|
||||
#define ERDMA_REGS_CEQ_DB_BASE_REG 0x100
|
||||
#define ERDMA_CMDQ_SQDB_REG 0x200
|
||||
#define ERDMA_CMDQ_CQDB_REG 0x300
|
||||
|
||||
/* DEV_CTRL_REG details. */
|
||||
#define ERDMA_REG_DEV_CTRL_RESET_MASK 0x00000001
|
||||
#define ERDMA_REG_DEV_CTRL_INIT_MASK 0x00000002
|
||||
|
||||
/* DEV_ST_REG details. */
|
||||
#define ERDMA_REG_DEV_ST_RESET_DONE_MASK 0x00000001U
|
||||
#define ERDMA_REG_DEV_ST_INIT_DONE_MASK 0x00000002U
|
||||
|
||||
/* eRDMA PCIe DBs definition. */
|
||||
#define ERDMA_BAR_DB_SPACE_BASE 4096
|
||||
|
||||
#define ERDMA_BAR_SQDB_SPACE_OFFSET ERDMA_BAR_DB_SPACE_BASE
|
||||
#define ERDMA_BAR_SQDB_SPACE_SIZE (384 * 1024)
|
||||
|
||||
#define ERDMA_BAR_RQDB_SPACE_OFFSET \
|
||||
(ERDMA_BAR_SQDB_SPACE_OFFSET + ERDMA_BAR_SQDB_SPACE_SIZE)
|
||||
#define ERDMA_BAR_RQDB_SPACE_SIZE (96 * 1024)
|
||||
|
||||
#define ERDMA_BAR_CQDB_SPACE_OFFSET \
|
||||
(ERDMA_BAR_RQDB_SPACE_OFFSET + ERDMA_BAR_RQDB_SPACE_SIZE)
|
||||
|
||||
/* Doorbell page resources related. */
|
||||
/*
|
||||
* At most 3072 direct SQEs can be issued in parallel per device;
* hardware organizes them into 24 groups with 128 credits each.
|
||||
*/
|
||||
#define ERDMA_DWQE_MAX_GRP_CNT 24
|
||||
#define ERDMA_DWQE_NUM_PER_GRP 128
|
||||
|
||||
#define ERDMA_DWQE_TYPE0_CNT 64
|
||||
#define ERDMA_DWQE_TYPE1_CNT 496
|
||||
/* type1 DB contains 2 DBs, takes 256Byte. */
|
||||
#define ERDMA_DWQE_TYPE1_CNT_PER_PAGE 16
|
||||
|
||||
#define ERDMA_SDB_SHARED_PAGE_INDEX 95
|
||||
|
||||
/* Doorbell related. */
|
||||
#define ERDMA_DB_SIZE 8
|
||||
|
||||
#define ERDMA_CQDB_IDX_MASK GENMASK_ULL(63, 56)
|
||||
#define ERDMA_CQDB_CQN_MASK GENMASK_ULL(55, 32)
|
||||
#define ERDMA_CQDB_ARM_MASK BIT_ULL(31)
|
||||
#define ERDMA_CQDB_SOL_MASK BIT_ULL(30)
|
||||
#define ERDMA_CQDB_CMDSN_MASK GENMASK_ULL(29, 28)
|
||||
#define ERDMA_CQDB_CI_MASK GENMASK_ULL(23, 0)
|
||||
|
||||
#define ERDMA_EQDB_ARM_MASK BIT(31)
|
||||
#define ERDMA_EQDB_CI_MASK GENMASK_ULL(23, 0)
|
||||
|
||||
#define ERDMA_PAGE_SIZE_SUPPORT 0x7FFFF000
|
||||
|
||||
/* WQE related: entry sizes in bytes and the matching log2 shifts. */
#define EQE_SIZE 16
#define EQE_SHIFT 4
#define RQE_SIZE 32
#define RQE_SHIFT 5
#define CQE_SIZE 32
#define CQE_SHIFT 5
#define SQEBB_SIZE 32
#define SQEBB_SHIFT 5
#define SQEBB_MASK (~(SQEBB_SIZE - 1))
/*
 * Round @size up to a multiple of SQEBB_SIZE. The argument is parenthesised
 * so that compound expressions (e.g. "a ? b : c") expand correctly.
 */
#define SQEBB_ALIGN(size) (((size) + SQEBB_SIZE - 1) & SQEBB_MASK)
/* Number of SQE basic blocks needed to hold @size bytes. */
#define SQEBB_COUNT(size) (SQEBB_ALIGN(size) >> SQEBB_SHIFT)
|
||||
|
||||
#define ERDMA_MAX_SQE_SIZE 128
|
||||
#define ERDMA_MAX_WQEBB_PER_SQE 4
|
||||
|
||||
/* CMDQ related. */
|
||||
#define ERDMA_CMDQ_MAX_OUTSTANDING 128
|
||||
#define ERDMA_CMDQ_SQE_SIZE 64
|
||||
|
||||
/* cmdq sub module definition. */
|
||||
enum CMDQ_WQE_SUB_MOD {
|
||||
CMDQ_SUBMOD_RDMA = 0,
|
||||
CMDQ_SUBMOD_COMMON = 1
|
||||
};
|
||||
|
||||
enum CMDQ_RDMA_OPCODE {
|
||||
CMDQ_OPCODE_QUERY_DEVICE = 0,
|
||||
CMDQ_OPCODE_CREATE_QP = 1,
|
||||
CMDQ_OPCODE_DESTROY_QP = 2,
|
||||
CMDQ_OPCODE_MODIFY_QP = 3,
|
||||
CMDQ_OPCODE_CREATE_CQ = 4,
|
||||
CMDQ_OPCODE_DESTROY_CQ = 5,
|
||||
CMDQ_OPCODE_REG_MR = 8,
|
||||
CMDQ_OPCODE_DEREG_MR = 9
|
||||
};
|
||||
|
||||
/* Opcodes used together with the CMDQ_SUBMOD_COMMON sub-module. */
enum CMDQ_COMMON_OPCODE {
	CMDQ_OPCODE_CREATE_EQ = 0,
	CMDQ_OPCODE_DESTROY_EQ = 1,
	CMDQ_OPCODE_QUERY_FW_INFO = 2,
};
|
||||
|
||||
/* cmdq-SQE HDR */
|
||||
#define ERDMA_CMD_HDR_WQEBB_CNT_MASK GENMASK_ULL(54, 52)
|
||||
#define ERDMA_CMD_HDR_CONTEXT_COOKIE_MASK GENMASK_ULL(47, 32)
|
||||
#define ERDMA_CMD_HDR_SUB_MOD_MASK GENMASK_ULL(25, 24)
|
||||
#define ERDMA_CMD_HDR_OPCODE_MASK GENMASK_ULL(23, 16)
|
||||
#define ERDMA_CMD_HDR_WQEBB_INDEX_MASK GENMASK_ULL(15, 0)
|
||||
|
||||
struct erdma_cmdq_destroy_cq_req {
|
||||
u64 hdr;
|
||||
u32 cqn;
|
||||
};
|
||||
|
||||
#define ERDMA_EQ_TYPE_AEQ 0
|
||||
#define ERDMA_EQ_TYPE_CEQ 1
|
||||
|
||||
struct erdma_cmdq_create_eq_req {
|
||||
u64 hdr;
|
||||
u64 qbuf_addr;
|
||||
u8 vector_idx;
|
||||
u8 eqn;
|
||||
u8 depth;
|
||||
u8 qtype;
|
||||
u32 db_dma_addr_l;
|
||||
u32 db_dma_addr_h;
|
||||
};
|
||||
|
||||
struct erdma_cmdq_destroy_eq_req {
|
||||
u64 hdr;
|
||||
u64 rsvd0;
|
||||
u8 vector_idx;
|
||||
u8 eqn;
|
||||
u8 rsvd1;
|
||||
u8 qtype;
|
||||
};
|
||||
|
||||
/* create_cq cfg0 */
|
||||
#define ERDMA_CMD_CREATE_CQ_DEPTH_MASK GENMASK(31, 24)
|
||||
#define ERDMA_CMD_CREATE_CQ_PAGESIZE_MASK GENMASK(23, 20)
|
||||
#define ERDMA_CMD_CREATE_CQ_CQN_MASK GENMASK(19, 0)
|
||||
|
||||
/* create_cq cfg1 */
|
||||
#define ERDMA_CMD_CREATE_CQ_MTT_CNT_MASK GENMASK(31, 16)
|
||||
#define ERDMA_CMD_CREATE_CQ_MTT_TYPE_MASK BIT(15)
|
||||
#define ERDMA_CMD_CREATE_CQ_EQN_MASK GENMASK(9, 0)
|
||||
|
||||
struct erdma_cmdq_create_cq_req {
|
||||
u64 hdr;
|
||||
u32 cfg0;
|
||||
u32 qbuf_addr_l;
|
||||
u32 qbuf_addr_h;
|
||||
u32 cfg1;
|
||||
u64 cq_db_info_addr;
|
||||
u32 first_page_offset;
|
||||
};
|
||||
|
||||
/* regmr/deregmr cfg0 */
|
||||
#define ERDMA_CMD_MR_VALID_MASK BIT(31)
|
||||
#define ERDMA_CMD_MR_KEY_MASK GENMASK(27, 20)
|
||||
#define ERDMA_CMD_MR_MPT_IDX_MASK GENMASK(19, 0)
|
||||
|
||||
/* regmr cfg1 */
|
||||
#define ERDMA_CMD_REGMR_PD_MASK GENMASK(31, 12)
|
||||
#define ERDMA_CMD_REGMR_TYPE_MASK GENMASK(7, 6)
|
||||
#define ERDMA_CMD_REGMR_RIGHT_MASK GENMASK(5, 2)
|
||||
#define ERDMA_CMD_REGMR_ACC_MODE_MASK GENMASK(1, 0)
|
||||
|
||||
/* regmr cfg2 */
|
||||
#define ERDMA_CMD_REGMR_PAGESIZE_MASK GENMASK(31, 27)
|
||||
#define ERDMA_CMD_REGMR_MTT_TYPE_MASK GENMASK(21, 20)
|
||||
#define ERDMA_CMD_REGMR_MTT_CNT_MASK GENMASK(19, 0)
|
||||
|
||||
struct erdma_cmdq_reg_mr_req {
|
||||
u64 hdr;
|
||||
u32 cfg0;
|
||||
u32 cfg1;
|
||||
u64 start_va;
|
||||
u32 size;
|
||||
u32 cfg2;
|
||||
u64 phy_addr[4];
|
||||
};
|
||||
|
||||
struct erdma_cmdq_dereg_mr_req {
|
||||
u64 hdr;
|
||||
u32 cfg;
|
||||
};
|
||||
|
||||
/* modify qp cfg */
|
||||
#define ERDMA_CMD_MODIFY_QP_STATE_MASK GENMASK(31, 24)
|
||||
#define ERDMA_CMD_MODIFY_QP_CC_MASK GENMASK(23, 20)
|
||||
#define ERDMA_CMD_MODIFY_QP_QPN_MASK GENMASK(19, 0)
|
||||
|
||||
struct erdma_cmdq_modify_qp_req {
|
||||
u64 hdr;
|
||||
u32 cfg;
|
||||
u32 cookie;
|
||||
__be32 dip;
|
||||
__be32 sip;
|
||||
__be16 sport;
|
||||
__be16 dport;
|
||||
u32 send_nxt;
|
||||
u32 recv_nxt;
|
||||
};
|
||||
|
||||
/* create qp cfg0 */
|
||||
#define ERDMA_CMD_CREATE_QP_SQ_DEPTH_MASK GENMASK(31, 20)
|
||||
#define ERDMA_CMD_CREATE_QP_QPN_MASK GENMASK(19, 0)
|
||||
|
||||
/* create qp cfg1 */
|
||||
#define ERDMA_CMD_CREATE_QP_RQ_DEPTH_MASK GENMASK(31, 20)
|
||||
#define ERDMA_CMD_CREATE_QP_PD_MASK GENMASK(19, 0)
|
||||
|
||||
/* create qp cqn_mtt_cfg */
|
||||
#define ERDMA_CMD_CREATE_QP_PAGE_SIZE_MASK GENMASK(31, 28)
|
||||
#define ERDMA_CMD_CREATE_QP_CQN_MASK GENMASK(23, 0)
|
||||
|
||||
/* create qp mtt_cfg */
|
||||
#define ERDMA_CMD_CREATE_QP_PAGE_OFFSET_MASK GENMASK(31, 12)
|
||||
#define ERDMA_CMD_CREATE_QP_MTT_CNT_MASK GENMASK(11, 1)
|
||||
#define ERDMA_CMD_CREATE_QP_MTT_TYPE_MASK BIT(0)
|
||||
|
||||
#define ERDMA_CMDQ_CREATE_QP_RESP_COOKIE_MASK GENMASK_ULL(31, 0)
|
||||
|
||||
struct erdma_cmdq_create_qp_req {
|
||||
u64 hdr;
|
||||
u32 cfg0;
|
||||
u32 cfg1;
|
||||
u32 sq_cqn_mtt_cfg;
|
||||
u32 rq_cqn_mtt_cfg;
|
||||
u64 sq_buf_addr;
|
||||
u64 rq_buf_addr;
|
||||
u32 sq_mtt_cfg;
|
||||
u32 rq_mtt_cfg;
|
||||
u64 sq_db_info_dma_addr;
|
||||
u64 rq_db_info_dma_addr;
|
||||
};
|
||||
|
||||
struct erdma_cmdq_destroy_qp_req {
|
||||
u64 hdr;
|
||||
u32 qpn;
|
||||
};
|
||||
|
||||
/* cap qword 0 definition */
|
||||
#define ERDMA_CMD_DEV_CAP_MAX_CQE_MASK GENMASK_ULL(47, 40)
|
||||
#define ERDMA_CMD_DEV_CAP_MAX_RECV_WR_MASK GENMASK_ULL(23, 16)
|
||||
#define ERDMA_CMD_DEV_CAP_MAX_MR_SIZE_MASK GENMASK_ULL(7, 0)
|
||||
|
||||
/* cap qword 1 definition */
|
||||
#define ERDMA_CMD_DEV_CAP_DMA_LOCAL_KEY_MASK GENMASK_ULL(63, 32)
|
||||
#define ERDMA_CMD_DEV_CAP_DEFAULT_CC_MASK GENMASK_ULL(31, 28)
|
||||
#define ERDMA_CMD_DEV_CAP_QBLOCK_MASK GENMASK_ULL(27, 16)
|
||||
#define ERDMA_CMD_DEV_CAP_MAX_MW_MASK GENMASK_ULL(7, 0)
|
||||
|
||||
#define ERDMA_NQP_PER_QBLOCK 1024
|
||||
|
||||
#define ERDMA_CMD_INFO0_FW_VER_MASK GENMASK_ULL(31, 0)
|
||||
|
||||
/* CQE hdr */
|
||||
#define ERDMA_CQE_HDR_OWNER_MASK BIT(31)
|
||||
#define ERDMA_CQE_HDR_OPCODE_MASK GENMASK(23, 16)
|
||||
#define ERDMA_CQE_HDR_QTYPE_MASK GENMASK(15, 8)
|
||||
#define ERDMA_CQE_HDR_SYNDROME_MASK GENMASK(7, 0)
|
||||
|
||||
#define ERDMA_CQE_QTYPE_SQ 0
|
||||
#define ERDMA_CQE_QTYPE_RQ 1
|
||||
#define ERDMA_CQE_QTYPE_CMDQ 2
|
||||
|
||||
struct erdma_cqe {
|
||||
__be32 hdr;
|
||||
__be32 qe_idx;
|
||||
__be32 qpn;
|
||||
union {
|
||||
__le32 imm_data;
|
||||
__be32 inv_rkey;
|
||||
};
|
||||
__be32 size;
|
||||
__be32 rsvd[3];
|
||||
};
|
||||
|
||||
struct erdma_sge {
|
||||
__aligned_le64 laddr;
|
||||
__le32 length;
|
||||
__le32 lkey;
|
||||
};
|
||||
|
||||
/* Receive Queue Element */
|
||||
struct erdma_rqe {
|
||||
__le16 qe_idx;
|
||||
__le16 rsvd0;
|
||||
__le32 qpn;
|
||||
__le32 rsvd1;
|
||||
__le32 rsvd2;
|
||||
__le64 to;
|
||||
__le32 length;
|
||||
__le32 stag;
|
||||
};
|
||||
|
||||
/* SQE */
|
||||
#define ERDMA_SQE_HDR_SGL_LEN_MASK GENMASK_ULL(63, 56)
|
||||
#define ERDMA_SQE_HDR_WQEBB_CNT_MASK GENMASK_ULL(54, 52)
|
||||
#define ERDMA_SQE_HDR_QPN_MASK GENMASK_ULL(51, 32)
|
||||
#define ERDMA_SQE_HDR_OPCODE_MASK GENMASK_ULL(31, 27)
|
||||
#define ERDMA_SQE_HDR_DWQE_MASK BIT_ULL(26)
|
||||
#define ERDMA_SQE_HDR_INLINE_MASK BIT_ULL(25)
|
||||
#define ERDMA_SQE_HDR_FENCE_MASK BIT_ULL(24)
|
||||
#define ERDMA_SQE_HDR_SE_MASK BIT_ULL(23)
|
||||
#define ERDMA_SQE_HDR_CE_MASK BIT_ULL(22)
|
||||
#define ERDMA_SQE_HDR_WQEBB_INDEX_MASK GENMASK_ULL(15, 0)
|
||||
|
||||
/* REG MR attrs */
|
||||
#define ERDMA_SQE_MR_MODE_MASK GENMASK(1, 0)
|
||||
#define ERDMA_SQE_MR_ACCESS_MASK GENMASK(5, 2)
|
||||
#define ERDMA_SQE_MR_MTT_TYPE_MASK GENMASK(7, 6)
|
||||
#define ERDMA_SQE_MR_MTT_CNT_MASK GENMASK(31, 12)
|
||||
|
||||
struct erdma_write_sqe {
|
||||
__le64 hdr;
|
||||
__be32 imm_data;
|
||||
__le32 length;
|
||||
|
||||
__le32 sink_stag;
|
||||
__le32 sink_to_l;
|
||||
__le32 sink_to_h;
|
||||
|
||||
__le32 rsvd;
|
||||
|
||||
struct erdma_sge sgl[0];
|
||||
};
|
||||
|
||||
struct erdma_send_sqe {
|
||||
__le64 hdr;
|
||||
union {
|
||||
__be32 imm_data;
|
||||
__le32 invalid_stag;
|
||||
};
|
||||
|
||||
__le32 length;
|
||||
struct erdma_sge sgl[0];
|
||||
};
|
||||
|
||||
struct erdma_readreq_sqe {
|
||||
__le64 hdr;
|
||||
__le32 invalid_stag;
|
||||
__le32 length;
|
||||
__le32 sink_stag;
|
||||
__le32 sink_to_l;
|
||||
__le32 sink_to_h;
|
||||
__le32 rsvd;
|
||||
};
|
||||
|
||||
struct erdma_reg_mr_sqe {
|
||||
__le64 hdr;
|
||||
__le64 addr;
|
||||
__le32 length;
|
||||
__le32 stag;
|
||||
__le32 attrs;
|
||||
__le32 rsvd;
|
||||
};
|
||||
|
||||
/* EQ related. */
|
||||
#define ERDMA_DEFAULT_EQ_DEPTH 256
|
||||
|
||||
/* ceqe */
|
||||
#define ERDMA_CEQE_HDR_DB_MASK BIT_ULL(63)
|
||||
#define ERDMA_CEQE_HDR_PI_MASK GENMASK_ULL(55, 32)
|
||||
#define ERDMA_CEQE_HDR_O_MASK BIT_ULL(31)
|
||||
#define ERDMA_CEQE_HDR_CQN_MASK GENMASK_ULL(19, 0)
|
||||
|
||||
/* aeqe */
|
||||
#define ERDMA_AEQE_HDR_O_MASK BIT(31)
|
||||
#define ERDMA_AEQE_HDR_TYPE_MASK GENMASK(23, 16)
|
||||
#define ERDMA_AEQE_HDR_SUBTYPE_MASK GENMASK(7, 0)
|
||||
|
||||
#define ERDMA_AE_TYPE_QP_FATAL_EVENT 0
|
||||
#define ERDMA_AE_TYPE_QP_ERQ_ERR_EVENT 1
|
||||
#define ERDMA_AE_TYPE_ACC_ERR_EVENT 2
|
||||
#define ERDMA_AE_TYPE_CQ_ERR 3
|
||||
#define ERDMA_AE_TYPE_OTHER_ERROR 4
|
||||
|
||||
struct erdma_aeqe {
|
||||
__le32 hdr;
|
||||
__le32 event_data0;
|
||||
__le32 event_data1;
|
||||
__le32 rsvd;
|
||||
};
|
||||
|
||||
enum erdma_opcode {
|
||||
ERDMA_OP_WRITE = 0,
|
||||
ERDMA_OP_READ = 1,
|
||||
ERDMA_OP_SEND = 2,
|
||||
ERDMA_OP_SEND_WITH_IMM = 3,
|
||||
|
||||
ERDMA_OP_RECEIVE = 4,
|
||||
ERDMA_OP_RECV_IMM = 5,
|
||||
ERDMA_OP_RECV_INV = 6,
|
||||
|
||||
ERDMA_OP_REQ_ERR = 7,
|
||||
ERDMA_OP_READ_RESPONSE = 8,
|
||||
ERDMA_OP_WRITE_WITH_IMM = 9,
|
||||
|
||||
ERDMA_OP_RECV_ERR = 10,
|
||||
|
||||
ERDMA_OP_INVALIDATE = 11,
|
||||
ERDMA_OP_RSP_SEND_IMM = 12,
|
||||
ERDMA_OP_SEND_WITH_INV = 13,
|
||||
|
||||
ERDMA_OP_REG_MR = 14,
|
||||
ERDMA_OP_LOCAL_INV = 15,
|
||||
ERDMA_OP_READ_WITH_INV = 16,
|
||||
ERDMA_NUM_OPCODES = 17,
|
||||
ERDMA_OP_INVALID = ERDMA_NUM_OPCODES + 1
|
||||
};
|
||||
|
||||
/*
 * Completion status codes. ERDMA_NUM_WC_STATUS is the number of defined
 * codes and must stay last.
 */
enum erdma_wc_status {
	ERDMA_WC_SUCCESS = 0,
	ERDMA_WC_GENERAL_ERR = 1,
	ERDMA_WC_RECV_WQE_FORMAT_ERR = 2,
	ERDMA_WC_RECV_STAG_INVALID_ERR = 3,
	ERDMA_WC_RECV_ADDR_VIOLATION_ERR = 4,
	ERDMA_WC_RECV_RIGHT_VIOLATION_ERR = 5,
	ERDMA_WC_RECV_PDID_ERR = 6,
	ERDMA_WC_RECV_WARRPING_ERR = 7,
	ERDMA_WC_SEND_WQE_FORMAT_ERR = 8,
	ERDMA_WC_SEND_WQE_ORD_EXCEED = 9,
	ERDMA_WC_SEND_STAG_INVALID_ERR = 10,
	ERDMA_WC_SEND_ADDR_VIOLATION_ERR = 11,
	ERDMA_WC_SEND_RIGHT_VIOLATION_ERR = 12,
	ERDMA_WC_SEND_PDID_ERR = 13,
	ERDMA_WC_SEND_WARRPING_ERR = 14,
	ERDMA_WC_FLUSH_ERR = 15,
	ERDMA_WC_RETRY_EXC_ERR = 16,
	ERDMA_NUM_WC_STATUS
};
|
||||
|
||||
enum erdma_vendor_err {
|
||||
ERDMA_WC_VENDOR_NO_ERR = 0,
|
||||
ERDMA_WC_VENDOR_INVALID_RQE = 1,
|
||||
ERDMA_WC_VENDOR_RQE_INVALID_STAG = 2,
|
||||
ERDMA_WC_VENDOR_RQE_ADDR_VIOLATION = 3,
|
||||
ERDMA_WC_VENDOR_RQE_ACCESS_RIGHT_ERR = 4,
|
||||
ERDMA_WC_VENDOR_RQE_INVALID_PD = 5,
|
||||
ERDMA_WC_VENDOR_RQE_WRAP_ERR = 6,
|
||||
ERDMA_WC_VENDOR_INVALID_SQE = 0x20,
|
||||
ERDMA_WC_VENDOR_ZERO_ORD = 0x21,
|
||||
ERDMA_WC_VENDOR_SQE_INVALID_STAG = 0x30,
|
||||
ERDMA_WC_VENDOR_SQE_ADDR_VIOLATION = 0x31,
|
||||
ERDMA_WC_VENDOR_SQE_ACCESS_ERR = 0x32,
|
||||
ERDMA_WC_VENDOR_SQE_INVALID_PD = 0x33,
|
||||
ERDMA_WC_VENDOR_SQE_WARP_ERR = 0x34
|
||||
};
|
||||
|
||||
#endif
|
||||
608
drivers/infiniband/hw/erdma/erdma_main.c
Normal file
608
drivers/infiniband/hw/erdma/erdma_main.c
Normal file
@@ -0,0 +1,608 @@
|
||||
// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
|
||||
|
||||
/* Authors: Cheng Xu <chengyou@linux.alibaba.com> */
|
||||
/* Kai Shen <kaishen@linux.alibaba.com> */
|
||||
/* Copyright (c) 2020-2022, Alibaba Group. */
|
||||
|
||||
#include <linux/errno.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/list.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/netdevice.h>
|
||||
#include <linux/pci.h>
|
||||
#include <net/addrconf.h>
|
||||
#include <rdma/erdma-abi.h>
|
||||
#include <rdma/ib_verbs.h>
|
||||
#include <rdma/ib_user_verbs.h>
|
||||
|
||||
#include "erdma.h"
|
||||
#include "erdma_cm.h"
|
||||
#include "erdma_hw.h"
|
||||
#include "erdma_verbs.h"
|
||||
|
||||
MODULE_AUTHOR("Cheng Xu <chengyou@linux.alibaba.com>");
|
||||
MODULE_DESCRIPTION("Alibaba elasticRDMA adapter driver");
|
||||
MODULE_LICENSE("Dual BSD/GPL");
|
||||
|
||||
static int erdma_netdev_event(struct notifier_block *nb, unsigned long event,
|
||||
void *arg)
|
||||
{
|
||||
struct net_device *netdev = netdev_notifier_info_to_dev(arg);
|
||||
struct erdma_dev *dev = container_of(nb, struct erdma_dev, netdev_nb);
|
||||
|
||||
if (dev->netdev == NULL || dev->netdev != netdev)
|
||||
goto done;
|
||||
|
||||
switch (event) {
|
||||
case NETDEV_UP:
|
||||
dev->state = IB_PORT_ACTIVE;
|
||||
erdma_port_event(dev, IB_EVENT_PORT_ACTIVE);
|
||||
break;
|
||||
case NETDEV_DOWN:
|
||||
dev->state = IB_PORT_DOWN;
|
||||
erdma_port_event(dev, IB_EVENT_PORT_ERR);
|
||||
break;
|
||||
case NETDEV_REGISTER:
|
||||
case NETDEV_UNREGISTER:
|
||||
case NETDEV_CHANGEADDR:
|
||||
case NETDEV_CHANGEMTU:
|
||||
case NETDEV_GOING_DOWN:
|
||||
case NETDEV_CHANGE:
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
done:
|
||||
return NOTIFY_OK;
|
||||
}
|
||||
|
||||
static int erdma_enum_and_get_netdev(struct erdma_dev *dev)
|
||||
{
|
||||
struct net_device *netdev;
|
||||
int ret = -ENODEV;
|
||||
|
||||
/* Already binded to a net_device, so we skip. */
|
||||
if (dev->netdev)
|
||||
return 0;
|
||||
|
||||
rtnl_lock();
|
||||
for_each_netdev(&init_net, netdev) {
|
||||
/*
|
||||
* In erdma, the paired netdev and ibdev should have the same
|
||||
* MAC address. erdma can get the value from its PCIe bar
|
||||
* registers. Since erdma can not get the paired netdev
|
||||
* reference directly, we do a traverse here to get the paired
|
||||
* netdev.
|
||||
*/
|
||||
if (ether_addr_equal_unaligned(netdev->perm_addr,
|
||||
dev->attrs.peer_addr)) {
|
||||
ret = ib_device_set_netdev(&dev->ibdev, netdev, 1);
|
||||
if (ret) {
|
||||
rtnl_unlock();
|
||||
ibdev_warn(&dev->ibdev,
|
||||
"failed (%d) to link netdev", ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
dev->netdev = netdev;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
rtnl_unlock();
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int erdma_device_register(struct erdma_dev *dev)
|
||||
{
|
||||
struct ib_device *ibdev = &dev->ibdev;
|
||||
int ret;
|
||||
|
||||
ret = erdma_enum_and_get_netdev(dev);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
addrconf_addr_eui48((u8 *)&ibdev->node_guid, dev->netdev->dev_addr);
|
||||
|
||||
ret = ib_register_device(ibdev, "erdma_%d", &dev->pdev->dev);
|
||||
if (ret) {
|
||||
dev_err(&dev->pdev->dev,
|
||||
"ib_register_device failed: ret = %d\n", ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
dev->netdev_nb.notifier_call = erdma_netdev_event;
|
||||
ret = register_netdevice_notifier(&dev->netdev_nb);
|
||||
if (ret) {
|
||||
ibdev_err(&dev->ibdev, "failed to register notifier.\n");
|
||||
ib_unregister_device(ibdev);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static irqreturn_t erdma_comm_irq_handler(int irq, void *data)
|
||||
{
|
||||
struct erdma_dev *dev = data;
|
||||
|
||||
erdma_cmdq_completion_handler(&dev->cmdq);
|
||||
erdma_aeq_event_handler(dev);
|
||||
|
||||
return IRQ_HANDLED;
|
||||
}
|
||||
|
||||
static void erdma_dwqe_resource_init(struct erdma_dev *dev)
|
||||
{
|
||||
int total_pages, type0, type1;
|
||||
|
||||
dev->attrs.grp_num = erdma_reg_read32(dev, ERDMA_REGS_GRP_NUM_REG);
|
||||
|
||||
if (dev->attrs.grp_num < 4)
|
||||
dev->attrs.disable_dwqe = true;
|
||||
else
|
||||
dev->attrs.disable_dwqe = false;
|
||||
|
||||
/* One page contains 4 goups. */
|
||||
total_pages = dev->attrs.grp_num * 4;
|
||||
|
||||
if (dev->attrs.grp_num >= ERDMA_DWQE_MAX_GRP_CNT) {
|
||||
dev->attrs.grp_num = ERDMA_DWQE_MAX_GRP_CNT;
|
||||
type0 = ERDMA_DWQE_TYPE0_CNT;
|
||||
type1 = ERDMA_DWQE_TYPE1_CNT / ERDMA_DWQE_TYPE1_CNT_PER_PAGE;
|
||||
} else {
|
||||
type1 = total_pages / 3;
|
||||
type0 = total_pages - type1 - 1;
|
||||
}
|
||||
|
||||
dev->attrs.dwqe_pages = type0;
|
||||
dev->attrs.dwqe_entries = type1 * ERDMA_DWQE_TYPE1_CNT_PER_PAGE;
|
||||
}
|
||||
|
||||
static int erdma_request_vectors(struct erdma_dev *dev)
|
||||
{
|
||||
int expect_irq_num = min(num_possible_cpus() + 1, ERDMA_NUM_MSIX_VEC);
|
||||
int ret;
|
||||
|
||||
ret = pci_alloc_irq_vectors(dev->pdev, 1, expect_irq_num, PCI_IRQ_MSIX);
|
||||
if (ret < 0) {
|
||||
dev_err(&dev->pdev->dev, "request irq vectors failed(%d)\n",
|
||||
ret);
|
||||
return ret;
|
||||
}
|
||||
dev->attrs.irq_num = ret;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int erdma_comm_irq_init(struct erdma_dev *dev)
|
||||
{
|
||||
snprintf(dev->comm_irq.name, ERDMA_IRQNAME_SIZE, "erdma-common@pci:%s",
|
||||
pci_name(dev->pdev));
|
||||
dev->comm_irq.msix_vector =
|
||||
pci_irq_vector(dev->pdev, ERDMA_MSIX_VECTOR_CMDQ);
|
||||
|
||||
cpumask_set_cpu(cpumask_first(cpumask_of_pcibus(dev->pdev->bus)),
|
||||
&dev->comm_irq.affinity_hint_mask);
|
||||
irq_set_affinity_hint(dev->comm_irq.msix_vector,
|
||||
&dev->comm_irq.affinity_hint_mask);
|
||||
|
||||
return request_irq(dev->comm_irq.msix_vector, erdma_comm_irq_handler, 0,
|
||||
dev->comm_irq.name, dev);
|
||||
}
|
||||
|
||||
static void erdma_comm_irq_uninit(struct erdma_dev *dev)
|
||||
{
|
||||
irq_set_affinity_hint(dev->comm_irq.msix_vector, NULL);
|
||||
free_irq(dev->comm_irq.msix_vector, dev);
|
||||
}
|
||||
|
||||
static int erdma_device_init(struct erdma_dev *dev, struct pci_dev *pdev)
|
||||
{
|
||||
int ret;
|
||||
|
||||
erdma_dwqe_resource_init(dev);
|
||||
|
||||
ret = dma_set_mask_and_coherent(&pdev->dev,
|
||||
DMA_BIT_MASK(ERDMA_PCI_WIDTH));
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
dma_set_max_seg_size(&pdev->dev, UINT_MAX);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void erdma_device_uninit(struct erdma_dev *dev)
|
||||
{
|
||||
u32 ctrl = FIELD_PREP(ERDMA_REG_DEV_CTRL_RESET_MASK, 1);
|
||||
|
||||
erdma_reg_write32(dev, ERDMA_REGS_DEV_CTRL_REG, ctrl);
|
||||
}
|
||||
|
||||
static const struct pci_device_id erdma_pci_tbl[] = {
|
||||
{ PCI_DEVICE(PCI_VENDOR_ID_ALIBABA, 0x107f) },
|
||||
{}
|
||||
};
|
||||
|
||||
static int erdma_probe_dev(struct pci_dev *pdev)
|
||||
{
|
||||
struct erdma_dev *dev;
|
||||
int bars, err;
|
||||
u32 version;
|
||||
|
||||
err = pci_enable_device(pdev);
|
||||
if (err) {
|
||||
dev_err(&pdev->dev, "pci_enable_device failed(%d)\n", err);
|
||||
return err;
|
||||
}
|
||||
|
||||
pci_set_master(pdev);
|
||||
|
||||
dev = ib_alloc_device(erdma_dev, ibdev);
|
||||
if (!dev) {
|
||||
dev_err(&pdev->dev, "ib_alloc_device failed\n");
|
||||
err = -ENOMEM;
|
||||
goto err_disable_device;
|
||||
}
|
||||
|
||||
pci_set_drvdata(pdev, dev);
|
||||
dev->pdev = pdev;
|
||||
dev->attrs.numa_node = dev_to_node(&pdev->dev);
|
||||
|
||||
bars = pci_select_bars(pdev, IORESOURCE_MEM);
|
||||
err = pci_request_selected_regions(pdev, bars, DRV_MODULE_NAME);
|
||||
if (bars != ERDMA_BAR_MASK || err) {
|
||||
err = err ? err : -EINVAL;
|
||||
goto err_ib_device_release;
|
||||
}
|
||||
|
||||
dev->func_bar_addr = pci_resource_start(pdev, ERDMA_FUNC_BAR);
|
||||
dev->func_bar_len = pci_resource_len(pdev, ERDMA_FUNC_BAR);
|
||||
|
||||
dev->func_bar =
|
||||
devm_ioremap(&pdev->dev, dev->func_bar_addr, dev->func_bar_len);
|
||||
if (!dev->func_bar) {
|
||||
dev_err(&pdev->dev, "devm_ioremap failed.\n");
|
||||
err = -EFAULT;
|
||||
goto err_release_bars;
|
||||
}
|
||||
|
||||
version = erdma_reg_read32(dev, ERDMA_REGS_VERSION_REG);
|
||||
if (version == 0) {
|
||||
/* we knows that it is a non-functional function. */
|
||||
err = -ENODEV;
|
||||
goto err_iounmap_func_bar;
|
||||
}
|
||||
|
||||
err = erdma_device_init(dev, pdev);
|
||||
if (err)
|
||||
goto err_iounmap_func_bar;
|
||||
|
||||
err = erdma_request_vectors(dev);
|
||||
if (err)
|
||||
goto err_iounmap_func_bar;
|
||||
|
||||
err = erdma_comm_irq_init(dev);
|
||||
if (err)
|
||||
goto err_free_vectors;
|
||||
|
||||
err = erdma_aeq_init(dev);
|
||||
if (err)
|
||||
goto err_uninit_comm_irq;
|
||||
|
||||
err = erdma_cmdq_init(dev);
|
||||
if (err)
|
||||
goto err_uninit_aeq;
|
||||
|
||||
err = erdma_ceqs_init(dev);
|
||||
if (err)
|
||||
goto err_uninit_cmdq;
|
||||
|
||||
erdma_finish_cmdq_init(dev);
|
||||
|
||||
return 0;
|
||||
|
||||
err_uninit_cmdq:
|
||||
erdma_device_uninit(dev);
|
||||
erdma_cmdq_destroy(dev);
|
||||
|
||||
err_uninit_aeq:
|
||||
erdma_aeq_destroy(dev);
|
||||
|
||||
err_uninit_comm_irq:
|
||||
erdma_comm_irq_uninit(dev);
|
||||
|
||||
err_free_vectors:
|
||||
pci_free_irq_vectors(dev->pdev);
|
||||
|
||||
err_iounmap_func_bar:
|
||||
devm_iounmap(&pdev->dev, dev->func_bar);
|
||||
|
||||
err_release_bars:
|
||||
pci_release_selected_regions(pdev, bars);
|
||||
|
||||
err_ib_device_release:
|
||||
ib_dealloc_device(&dev->ibdev);
|
||||
|
||||
err_disable_device:
|
||||
pci_disable_device(pdev);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
static void erdma_remove_dev(struct pci_dev *pdev)
|
||||
{
|
||||
struct erdma_dev *dev = pci_get_drvdata(pdev);
|
||||
|
||||
erdma_ceqs_uninit(dev);
|
||||
|
||||
erdma_device_uninit(dev);
|
||||
|
||||
erdma_cmdq_destroy(dev);
|
||||
erdma_aeq_destroy(dev);
|
||||
erdma_comm_irq_uninit(dev);
|
||||
pci_free_irq_vectors(dev->pdev);
|
||||
|
||||
devm_iounmap(&pdev->dev, dev->func_bar);
|
||||
pci_release_selected_regions(pdev, ERDMA_BAR_MASK);
|
||||
|
||||
ib_dealloc_device(&dev->ibdev);
|
||||
|
||||
pci_disable_device(pdev);
|
||||
}
|
||||
|
||||
#define ERDMA_GET_CAP(name, cap) FIELD_GET(ERDMA_CMD_DEV_CAP_##name##_MASK, cap)
|
||||
|
||||
static int erdma_dev_attrs_init(struct erdma_dev *dev)
|
||||
{
|
||||
int err;
|
||||
u64 req_hdr, cap0, cap1;
|
||||
|
||||
erdma_cmdq_build_reqhdr(&req_hdr, CMDQ_SUBMOD_RDMA,
|
||||
CMDQ_OPCODE_QUERY_DEVICE);
|
||||
|
||||
err = erdma_post_cmd_wait(&dev->cmdq, &req_hdr, sizeof(req_hdr), &cap0,
|
||||
&cap1);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
dev->attrs.max_cqe = 1 << ERDMA_GET_CAP(MAX_CQE, cap0);
|
||||
dev->attrs.max_mr_size = 1ULL << ERDMA_GET_CAP(MAX_MR_SIZE, cap0);
|
||||
dev->attrs.max_mw = 1 << ERDMA_GET_CAP(MAX_MW, cap1);
|
||||
dev->attrs.max_recv_wr = 1 << ERDMA_GET_CAP(MAX_RECV_WR, cap0);
|
||||
dev->attrs.local_dma_key = ERDMA_GET_CAP(DMA_LOCAL_KEY, cap1);
|
||||
dev->attrs.cc = ERDMA_GET_CAP(DEFAULT_CC, cap1);
|
||||
dev->attrs.max_qp = ERDMA_NQP_PER_QBLOCK * ERDMA_GET_CAP(QBLOCK, cap1);
|
||||
dev->attrs.max_mr = dev->attrs.max_qp << 1;
|
||||
dev->attrs.max_cq = dev->attrs.max_qp << 1;
|
||||
|
||||
dev->attrs.max_send_wr = ERDMA_MAX_SEND_WR;
|
||||
dev->attrs.max_ord = ERDMA_MAX_ORD;
|
||||
dev->attrs.max_ird = ERDMA_MAX_IRD;
|
||||
dev->attrs.max_send_sge = ERDMA_MAX_SEND_SGE;
|
||||
dev->attrs.max_recv_sge = ERDMA_MAX_RECV_SGE;
|
||||
dev->attrs.max_sge_rd = ERDMA_MAX_SGE_RD;
|
||||
dev->attrs.max_pd = ERDMA_MAX_PD;
|
||||
|
||||
dev->res_cb[ERDMA_RES_TYPE_PD].max_cap = ERDMA_MAX_PD;
|
||||
dev->res_cb[ERDMA_RES_TYPE_STAG_IDX].max_cap = dev->attrs.max_mr;
|
||||
|
||||
erdma_cmdq_build_reqhdr(&req_hdr, CMDQ_SUBMOD_COMMON,
|
||||
CMDQ_OPCODE_QUERY_FW_INFO);
|
||||
|
||||
err = erdma_post_cmd_wait(&dev->cmdq, &req_hdr, sizeof(req_hdr), &cap0,
|
||||
&cap1);
|
||||
if (!err)
|
||||
dev->attrs.fw_version =
|
||||
FIELD_GET(ERDMA_CMD_INFO0_FW_VER_MASK, cap0);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
static int erdma_res_cb_init(struct erdma_dev *dev)
|
||||
{
|
||||
int i, j;
|
||||
|
||||
for (i = 0; i < ERDMA_RES_CNT; i++) {
|
||||
dev->res_cb[i].next_alloc_idx = 1;
|
||||
spin_lock_init(&dev->res_cb[i].lock);
|
||||
dev->res_cb[i].bitmap =
|
||||
bitmap_zalloc(dev->res_cb[i].max_cap, GFP_KERNEL);
|
||||
if (!dev->res_cb[i].bitmap)
|
||||
goto err;
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
||||
err:
|
||||
for (j = 0; j < i; j++)
|
||||
bitmap_free(dev->res_cb[j].bitmap);
|
||||
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
static void erdma_res_cb_free(struct erdma_dev *dev)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < ERDMA_RES_CNT; i++)
|
||||
bitmap_free(dev->res_cb[i].bitmap);
|
||||
}
|
||||
|
||||
static const struct ib_device_ops erdma_device_ops = {
|
||||
.owner = THIS_MODULE,
|
||||
.driver_id = RDMA_DRIVER_ERDMA,
|
||||
.uverbs_abi_ver = ERDMA_ABI_VERSION,
|
||||
|
||||
.alloc_mr = erdma_ib_alloc_mr,
|
||||
.alloc_pd = erdma_alloc_pd,
|
||||
.alloc_ucontext = erdma_alloc_ucontext,
|
||||
.create_cq = erdma_create_cq,
|
||||
.create_qp = erdma_create_qp,
|
||||
.dealloc_pd = erdma_dealloc_pd,
|
||||
.dealloc_ucontext = erdma_dealloc_ucontext,
|
||||
.dereg_mr = erdma_dereg_mr,
|
||||
.destroy_cq = erdma_destroy_cq,
|
||||
.destroy_qp = erdma_destroy_qp,
|
||||
.get_dma_mr = erdma_get_dma_mr,
|
||||
.get_port_immutable = erdma_get_port_immutable,
|
||||
.iw_accept = erdma_accept,
|
||||
.iw_add_ref = erdma_qp_get_ref,
|
||||
.iw_connect = erdma_connect,
|
||||
.iw_create_listen = erdma_create_listen,
|
||||
.iw_destroy_listen = erdma_destroy_listen,
|
||||
.iw_get_qp = erdma_get_ibqp,
|
||||
.iw_reject = erdma_reject,
|
||||
.iw_rem_ref = erdma_qp_put_ref,
|
||||
.map_mr_sg = erdma_map_mr_sg,
|
||||
.mmap = erdma_mmap,
|
||||
.mmap_free = erdma_mmap_free,
|
||||
.modify_qp = erdma_modify_qp,
|
||||
.post_recv = erdma_post_recv,
|
||||
.post_send = erdma_post_send,
|
||||
.poll_cq = erdma_poll_cq,
|
||||
.query_device = erdma_query_device,
|
||||
.query_gid = erdma_query_gid,
|
||||
.query_port = erdma_query_port,
|
||||
.query_qp = erdma_query_qp,
|
||||
.req_notify_cq = erdma_req_notify_cq,
|
||||
.reg_user_mr = erdma_reg_user_mr,
|
||||
|
||||
INIT_RDMA_OBJ_SIZE(ib_cq, erdma_cq, ibcq),
|
||||
INIT_RDMA_OBJ_SIZE(ib_pd, erdma_pd, ibpd),
|
||||
INIT_RDMA_OBJ_SIZE(ib_ucontext, erdma_ucontext, ibucontext),
|
||||
INIT_RDMA_OBJ_SIZE(ib_qp, erdma_qp, ibqp),
|
||||
};
|
||||
|
||||
static int erdma_ib_device_add(struct pci_dev *pdev)
|
||||
{
|
||||
struct erdma_dev *dev = pci_get_drvdata(pdev);
|
||||
struct ib_device *ibdev = &dev->ibdev;
|
||||
u64 mac;
|
||||
int ret;
|
||||
|
||||
ret = erdma_dev_attrs_init(dev);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
ibdev->node_type = RDMA_NODE_RNIC;
|
||||
memcpy(ibdev->node_desc, ERDMA_NODE_DESC, sizeof(ERDMA_NODE_DESC));
|
||||
|
||||
/*
|
||||
* Current model (one-to-one device association):
|
||||
* One ERDMA device per net_device or, equivalently,
|
||||
* per physical port.
|
||||
*/
|
||||
ibdev->phys_port_cnt = 1;
|
||||
ibdev->num_comp_vectors = dev->attrs.irq_num - 1;
|
||||
|
||||
ib_set_device_ops(ibdev, &erdma_device_ops);
|
||||
|
||||
INIT_LIST_HEAD(&dev->cep_list);
|
||||
|
||||
spin_lock_init(&dev->lock);
|
||||
xa_init_flags(&dev->qp_xa, XA_FLAGS_ALLOC1);
|
||||
xa_init_flags(&dev->cq_xa, XA_FLAGS_ALLOC1);
|
||||
dev->next_alloc_cqn = 1;
|
||||
dev->next_alloc_qpn = 1;
|
||||
|
||||
ret = erdma_res_cb_init(dev);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
spin_lock_init(&dev->db_bitmap_lock);
|
||||
bitmap_zero(dev->sdb_page, ERDMA_DWQE_TYPE0_CNT);
|
||||
bitmap_zero(dev->sdb_entry, ERDMA_DWQE_TYPE1_CNT);
|
||||
|
||||
atomic_set(&dev->num_ctx, 0);
|
||||
|
||||
mac = erdma_reg_read32(dev, ERDMA_REGS_NETDEV_MAC_L_REG);
|
||||
mac |= (u64)erdma_reg_read32(dev, ERDMA_REGS_NETDEV_MAC_H_REG) << 32;
|
||||
|
||||
u64_to_ether_addr(mac, dev->attrs.peer_addr);
|
||||
|
||||
ret = erdma_device_register(dev);
|
||||
if (ret)
|
||||
goto err_out;
|
||||
|
||||
return 0;
|
||||
|
||||
err_out:
|
||||
xa_destroy(&dev->qp_xa);
|
||||
xa_destroy(&dev->cq_xa);
|
||||
|
||||
erdma_res_cb_free(dev);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void erdma_ib_device_remove(struct pci_dev *pdev)
|
||||
{
|
||||
struct erdma_dev *dev = pci_get_drvdata(pdev);
|
||||
|
||||
unregister_netdevice_notifier(&dev->netdev_nb);
|
||||
ib_unregister_device(&dev->ibdev);
|
||||
|
||||
erdma_res_cb_free(dev);
|
||||
xa_destroy(&dev->qp_xa);
|
||||
xa_destroy(&dev->cq_xa);
|
||||
}
|
||||
|
||||
/*
 * PCI probe callback: bring up the PCI function, then add the ib_device.
 * If the second step fails the PCI-level setup is torn down again.
 */
static int erdma_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
{
	int ret;

	ret = erdma_probe_dev(pdev);
	if (ret)
		return ret;

	ret = erdma_ib_device_add(pdev);
	if (ret)
		erdma_remove_dev(pdev);

	return ret;
}
|
||||
|
||||
/* PCI remove callback: tear down the ib_device first, then the PCI function. */
static void erdma_remove(struct pci_dev *pdev)
{
	erdma_ib_device_remove(pdev);

	erdma_remove_dev(pdev);
}
|
||||
|
||||
static struct pci_driver erdma_pci_driver = {
|
||||
.name = DRV_MODULE_NAME,
|
||||
.id_table = erdma_pci_tbl,
|
||||
.probe = erdma_probe,
|
||||
.remove = erdma_remove
|
||||
};
|
||||
|
||||
MODULE_DEVICE_TABLE(pci, erdma_pci_tbl);
|
||||
|
||||
static __init int erdma_init_module(void)
|
||||
{
|
||||
int ret;
|
||||
|
||||
ret = erdma_cm_init();
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
ret = pci_register_driver(&erdma_pci_driver);
|
||||
if (ret)
|
||||
erdma_cm_exit();
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* Module exit: unregister the PCI driver, then shut down the connection manager. */
static void __exit erdma_exit_module(void)
{
	pci_unregister_driver(&erdma_pci_driver);
	erdma_cm_exit();
}
|
||||
|
||||
module_init(erdma_init_module);
|
||||
module_exit(erdma_exit_module);
|
||||
566
drivers/infiniband/hw/erdma/erdma_qp.c
Normal file
566
drivers/infiniband/hw/erdma/erdma_qp.c
Normal file
@@ -0,0 +1,566 @@
|
||||
// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
|
||||
|
||||
/* Authors: Cheng Xu <chengyou@linux.alibaba.com> */
|
||||
/* Kai Shen <kaishen@linux.alibaba.com> */
|
||||
/* Copyright (c) 2020-2021, Alibaba Group */
|
||||
/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
|
||||
/* Copyright (c) 2008-2019, IBM Corporation */
|
||||
|
||||
#include <linux/errno.h>
|
||||
#include <linux/pci.h>
|
||||
#include <linux/scatterlist.h>
|
||||
#include <linux/types.h>
|
||||
|
||||
#include <rdma/ib_user_verbs.h>
|
||||
#include <rdma/ib_verbs.h>
|
||||
|
||||
#include "erdma.h"
|
||||
#include "erdma_cm.h"
|
||||
#include "erdma_verbs.h"
|
||||
|
||||
void erdma_qp_llp_close(struct erdma_qp *qp)
|
||||
{
|
||||
struct erdma_qp_attrs qp_attrs;
|
||||
|
||||
down_write(&qp->state_lock);
|
||||
|
||||
switch (qp->attrs.state) {
|
||||
case ERDMA_QP_STATE_RTS:
|
||||
case ERDMA_QP_STATE_RTR:
|
||||
case ERDMA_QP_STATE_IDLE:
|
||||
case ERDMA_QP_STATE_TERMINATE:
|
||||
qp_attrs.state = ERDMA_QP_STATE_CLOSING;
|
||||
erdma_modify_qp_internal(qp, &qp_attrs, ERDMA_QP_ATTR_STATE);
|
||||
break;
|
||||
case ERDMA_QP_STATE_CLOSING:
|
||||
qp->attrs.state = ERDMA_QP_STATE_IDLE;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
if (qp->cep) {
|
||||
erdma_cep_put(qp->cep);
|
||||
qp->cep = NULL;
|
||||
}
|
||||
|
||||
up_write(&qp->state_lock);
|
||||
}
|
||||
|
||||
struct ib_qp *erdma_get_ibqp(struct ib_device *ibdev, int id)
|
||||
{
|
||||
struct erdma_qp *qp = find_qp_by_qpn(to_edev(ibdev), id);
|
||||
|
||||
if (qp)
|
||||
return &qp->ibqp;
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static int erdma_modify_qp_state_to_rts(struct erdma_qp *qp,
|
||||
struct erdma_qp_attrs *attrs,
|
||||
enum erdma_qp_attr_mask mask)
|
||||
{
|
||||
int ret;
|
||||
struct erdma_dev *dev = qp->dev;
|
||||
struct erdma_cmdq_modify_qp_req req;
|
||||
struct tcp_sock *tp;
|
||||
struct erdma_cep *cep = qp->cep;
|
||||
struct sockaddr_storage local_addr, remote_addr;
|
||||
|
||||
if (!(mask & ERDMA_QP_ATTR_LLP_HANDLE))
|
||||
return -EINVAL;
|
||||
|
||||
if (!(mask & ERDMA_QP_ATTR_MPA))
|
||||
return -EINVAL;
|
||||
|
||||
ret = getname_local(cep->sock, &local_addr);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
ret = getname_peer(cep->sock, &remote_addr);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
qp->attrs.state = ERDMA_QP_STATE_RTS;
|
||||
|
||||
tp = tcp_sk(qp->cep->sock->sk);
|
||||
|
||||
erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA,
|
||||
CMDQ_OPCODE_MODIFY_QP);
|
||||
|
||||
req.cfg = FIELD_PREP(ERDMA_CMD_MODIFY_QP_STATE_MASK, qp->attrs.state) |
|
||||
FIELD_PREP(ERDMA_CMD_MODIFY_QP_CC_MASK, qp->attrs.cc) |
|
||||
FIELD_PREP(ERDMA_CMD_MODIFY_QP_QPN_MASK, QP_ID(qp));
|
||||
|
||||
req.cookie = be32_to_cpu(qp->cep->mpa.ext_data.cookie);
|
||||
req.dip = to_sockaddr_in(remote_addr).sin_addr.s_addr;
|
||||
req.sip = to_sockaddr_in(local_addr).sin_addr.s_addr;
|
||||
req.dport = to_sockaddr_in(remote_addr).sin_port;
|
||||
req.sport = to_sockaddr_in(local_addr).sin_port;
|
||||
|
||||
req.send_nxt = tp->snd_nxt;
|
||||
/* rsvd tcp seq for mpa-rsp in server. */
|
||||
if (qp->attrs.qp_type == ERDMA_QP_PASSIVE)
|
||||
req.send_nxt += MPA_DEFAULT_HDR_LEN + qp->attrs.pd_len;
|
||||
req.recv_nxt = tp->rcv_nxt;
|
||||
|
||||
return erdma_post_cmd_wait(&dev->cmdq, (u64 *)&req, sizeof(req), NULL,
|
||||
NULL);
|
||||
}
|
||||
|
||||
static int erdma_modify_qp_state_to_stop(struct erdma_qp *qp,
|
||||
struct erdma_qp_attrs *attrs,
|
||||
enum erdma_qp_attr_mask mask)
|
||||
{
|
||||
struct erdma_dev *dev = qp->dev;
|
||||
struct erdma_cmdq_modify_qp_req req;
|
||||
|
||||
qp->attrs.state = attrs->state;
|
||||
|
||||
erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA,
|
||||
CMDQ_OPCODE_MODIFY_QP);
|
||||
|
||||
req.cfg = FIELD_PREP(ERDMA_CMD_MODIFY_QP_STATE_MASK, attrs->state) |
|
||||
FIELD_PREP(ERDMA_CMD_MODIFY_QP_QPN_MASK, QP_ID(qp));
|
||||
|
||||
return erdma_post_cmd_wait(&dev->cmdq, (u64 *)&req, sizeof(req), NULL,
|
||||
NULL);
|
||||
}
|
||||
|
||||
int erdma_modify_qp_internal(struct erdma_qp *qp, struct erdma_qp_attrs *attrs,
|
||||
enum erdma_qp_attr_mask mask)
|
||||
{
|
||||
int drop_conn, ret = 0;
|
||||
|
||||
if (!mask)
|
||||
return 0;
|
||||
|
||||
if (!(mask & ERDMA_QP_ATTR_STATE))
|
||||
return 0;
|
||||
|
||||
switch (qp->attrs.state) {
|
||||
case ERDMA_QP_STATE_IDLE:
|
||||
case ERDMA_QP_STATE_RTR:
|
||||
if (attrs->state == ERDMA_QP_STATE_RTS) {
|
||||
ret = erdma_modify_qp_state_to_rts(qp, attrs, mask);
|
||||
} else if (attrs->state == ERDMA_QP_STATE_ERROR) {
|
||||
qp->attrs.state = ERDMA_QP_STATE_ERROR;
|
||||
if (qp->cep) {
|
||||
erdma_cep_put(qp->cep);
|
||||
qp->cep = NULL;
|
||||
}
|
||||
ret = erdma_modify_qp_state_to_stop(qp, attrs, mask);
|
||||
}
|
||||
break;
|
||||
case ERDMA_QP_STATE_RTS:
|
||||
drop_conn = 0;
|
||||
|
||||
if (attrs->state == ERDMA_QP_STATE_CLOSING) {
|
||||
ret = erdma_modify_qp_state_to_stop(qp, attrs, mask);
|
||||
drop_conn = 1;
|
||||
} else if (attrs->state == ERDMA_QP_STATE_TERMINATE) {
|
||||
qp->attrs.state = ERDMA_QP_STATE_TERMINATE;
|
||||
ret = erdma_modify_qp_state_to_stop(qp, attrs, mask);
|
||||
drop_conn = 1;
|
||||
} else if (attrs->state == ERDMA_QP_STATE_ERROR) {
|
||||
ret = erdma_modify_qp_state_to_stop(qp, attrs, mask);
|
||||
qp->attrs.state = ERDMA_QP_STATE_ERROR;
|
||||
drop_conn = 1;
|
||||
}
|
||||
|
||||
if (drop_conn)
|
||||
erdma_qp_cm_drop(qp);
|
||||
|
||||
break;
|
||||
case ERDMA_QP_STATE_TERMINATE:
|
||||
if (attrs->state == ERDMA_QP_STATE_ERROR)
|
||||
qp->attrs.state = ERDMA_QP_STATE_ERROR;
|
||||
break;
|
||||
case ERDMA_QP_STATE_CLOSING:
|
||||
if (attrs->state == ERDMA_QP_STATE_IDLE) {
|
||||
qp->attrs.state = ERDMA_QP_STATE_IDLE;
|
||||
} else if (attrs->state == ERDMA_QP_STATE_ERROR) {
|
||||
ret = erdma_modify_qp_state_to_stop(qp, attrs, mask);
|
||||
qp->attrs.state = ERDMA_QP_STATE_ERROR;
|
||||
} else if (attrs->state != ERDMA_QP_STATE_CLOSING) {
|
||||
return -ECONNABORTED;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void erdma_qp_safe_free(struct kref *ref)
|
||||
{
|
||||
struct erdma_qp *qp = container_of(ref, struct erdma_qp, ref);
|
||||
|
||||
complete(&qp->safe_free);
|
||||
}
|
||||
|
||||
void erdma_qp_put(struct erdma_qp *qp)
|
||||
{
|
||||
WARN_ON(kref_read(&qp->ref) < 1);
|
||||
kref_put(&qp->ref, erdma_qp_safe_free);
|
||||
}
|
||||
|
||||
void erdma_qp_get(struct erdma_qp *qp)
|
||||
{
|
||||
kref_get(&qp->ref);
|
||||
}
|
||||
|
||||
static int fill_inline_data(struct erdma_qp *qp,
|
||||
const struct ib_send_wr *send_wr, u16 wqe_idx,
|
||||
u32 sgl_offset, __le32 *length_field)
|
||||
{
|
||||
u32 remain_size, copy_size, data_off, bytes = 0;
|
||||
char *data;
|
||||
int i = 0;
|
||||
|
||||
wqe_idx += (sgl_offset >> SQEBB_SHIFT);
|
||||
sgl_offset &= (SQEBB_SIZE - 1);
|
||||
data = get_queue_entry(qp->kern_qp.sq_buf, wqe_idx, qp->attrs.sq_size,
|
||||
SQEBB_SHIFT);
|
||||
|
||||
while (i < send_wr->num_sge) {
|
||||
bytes += send_wr->sg_list[i].length;
|
||||
if (bytes > (int)ERDMA_MAX_INLINE)
|
||||
return -EINVAL;
|
||||
|
||||
remain_size = send_wr->sg_list[i].length;
|
||||
data_off = 0;
|
||||
|
||||
while (1) {
|
||||
copy_size = min(remain_size, SQEBB_SIZE - sgl_offset);
|
||||
|
||||
memcpy(data + sgl_offset,
|
||||
(void *)(uintptr_t)send_wr->sg_list[i].addr +
|
||||
data_off,
|
||||
copy_size);
|
||||
remain_size -= copy_size;
|
||||
data_off += copy_size;
|
||||
sgl_offset += copy_size;
|
||||
wqe_idx += (sgl_offset >> SQEBB_SHIFT);
|
||||
sgl_offset &= (SQEBB_SIZE - 1);
|
||||
|
||||
data = get_queue_entry(qp->kern_qp.sq_buf, wqe_idx,
|
||||
qp->attrs.sq_size, SQEBB_SHIFT);
|
||||
if (!remain_size)
|
||||
break;
|
||||
}
|
||||
|
||||
i++;
|
||||
}
|
||||
*length_field = cpu_to_le32(bytes);
|
||||
|
||||
return bytes;
|
||||
}
|
||||
|
||||
static int fill_sgl(struct erdma_qp *qp, const struct ib_send_wr *send_wr,
|
||||
u16 wqe_idx, u32 sgl_offset, __le32 *length_field)
|
||||
{
|
||||
int i = 0;
|
||||
u32 bytes = 0;
|
||||
char *sgl;
|
||||
|
||||
if (send_wr->num_sge > qp->dev->attrs.max_send_sge)
|
||||
return -EINVAL;
|
||||
|
||||
if (sgl_offset & 0xF)
|
||||
return -EINVAL;
|
||||
|
||||
while (i < send_wr->num_sge) {
|
||||
wqe_idx += (sgl_offset >> SQEBB_SHIFT);
|
||||
sgl_offset &= (SQEBB_SIZE - 1);
|
||||
sgl = get_queue_entry(qp->kern_qp.sq_buf, wqe_idx,
|
||||
qp->attrs.sq_size, SQEBB_SHIFT);
|
||||
|
||||
bytes += send_wr->sg_list[i].length;
|
||||
memcpy(sgl + sgl_offset, &send_wr->sg_list[i],
|
||||
sizeof(struct ib_sge));
|
||||
|
||||
sgl_offset += sizeof(struct ib_sge);
|
||||
i++;
|
||||
}
|
||||
|
||||
*length_field = cpu_to_le32(bytes);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Build one send queue element for send_wr at producer index *pi.
 *
 * The 64-bit header word is cleared first, the opcode-specific body and any
 * inline data or SGL are written, and the finished header is stored last.
 * On success *pi is advanced by the number of queue entries consumed.
 *
 * Returns 0, -EINVAL for a malformed request, or -EOPNOTSUPP for an opcode
 * that is not handled.
 */
static int erdma_push_one_sqe(struct erdma_qp *qp, u16 *pi,
			      const struct ib_send_wr *send_wr)
{
	u32 wqe_size, wqebb_cnt, hw_op, flags, sgl_offset;
	u32 idx = *pi & (qp->attrs.sq_size - 1);
	enum ib_wr_opcode op = send_wr->opcode;
	struct erdma_readreq_sqe *read_sqe;
	struct erdma_reg_mr_sqe *regmr_sge;
	struct erdma_write_sqe *write_sqe;
	struct erdma_send_sqe *send_sqe;
	struct ib_rdma_wr *rdma_wr;
	struct erdma_mr *mr;
	__le32 *length_field;
	u64 wqe_hdr, *entry;
	struct ib_sge *sge;
	u32 attrs;
	int ret;

	entry = get_queue_entry(qp->kern_qp.sq_buf, idx, qp->attrs.sq_size,
				SQEBB_SHIFT);

	/* Clear the SQE header section. */
	*entry = 0;

	qp->kern_qp.swr_tbl[idx] = send_wr->wr_id;
	flags = send_wr->send_flags;

	/* Header bits common to all opcodes. */
	wqe_hdr = FIELD_PREP(ERDMA_SQE_HDR_CE_MASK,
			     !!((flags & IB_SEND_SIGNALED) ||
				qp->kern_qp.sig_all));
	wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_SE_MASK,
			      !!(flags & IB_SEND_SOLICITED));
	wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_FENCE_MASK,
			      !!(flags & IB_SEND_FENCE));
	wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_INLINE_MASK,
			      !!(flags & IB_SEND_INLINE));
	wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_QPN_MASK, QP_ID(qp));

	switch (op) {
	case IB_WR_RDMA_WRITE:
	case IB_WR_RDMA_WRITE_WITH_IMM:
		hw_op = (op == IB_WR_RDMA_WRITE_WITH_IMM) ?
				ERDMA_OP_WRITE_WITH_IMM :
				ERDMA_OP_WRITE;
		wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_OPCODE_MASK, hw_op);
		rdma_wr = container_of(send_wr, struct ib_rdma_wr, wr);
		write_sqe = (struct erdma_write_sqe *)entry;

		write_sqe->imm_data = send_wr->ex.imm_data;
		write_sqe->sink_stag = cpu_to_le32(rdma_wr->rkey);
		write_sqe->sink_to_h =
			cpu_to_le32(upper_32_bits(rdma_wr->remote_addr));
		write_sqe->sink_to_l =
			cpu_to_le32(lower_32_bits(rdma_wr->remote_addr));

		/* Payload (inline data or SGL) follows the fixed part. */
		length_field = &write_sqe->length;
		wqe_size = sizeof(struct erdma_write_sqe);
		sgl_offset = wqe_size;
		break;
	case IB_WR_RDMA_READ:
	case IB_WR_RDMA_READ_WITH_INV:
		read_sqe = (struct erdma_readreq_sqe *)entry;
		if (unlikely(send_wr->num_sge != 1))
			return -EINVAL;

		if (op == IB_WR_RDMA_READ_WITH_INV) {
			hw_op = ERDMA_OP_READ_WITH_INV;
			read_sqe->invalid_stag =
				cpu_to_le32(send_wr->ex.invalidate_rkey);
		} else {
			hw_op = ERDMA_OP_READ;
		}

		wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_OPCODE_MASK, hw_op);
		rdma_wr = container_of(send_wr, struct ib_rdma_wr, wr);

		/* The local buffer is the sink of the read. */
		read_sqe->length = cpu_to_le32(send_wr->sg_list[0].length);
		read_sqe->sink_stag = cpu_to_le32(send_wr->sg_list[0].lkey);
		read_sqe->sink_to_l =
			cpu_to_le32(lower_32_bits(send_wr->sg_list[0].addr));
		read_sqe->sink_to_h =
			cpu_to_le32(upper_32_bits(send_wr->sg_list[0].addr));

		/* The remote source is written as an SGE in the next entry. */
		sge = get_queue_entry(qp->kern_qp.sq_buf, idx + 1,
				      qp->attrs.sq_size, SQEBB_SHIFT);
		sge->addr = rdma_wr->remote_addr;
		sge->lkey = rdma_wr->rkey;
		sge->length = send_wr->sg_list[0].length;
		wqe_size = sizeof(struct erdma_readreq_sqe) +
			   send_wr->num_sge * sizeof(struct ib_sge);

		goto out;
	case IB_WR_SEND:
	case IB_WR_SEND_WITH_IMM:
	case IB_WR_SEND_WITH_INV:
		send_sqe = (struct erdma_send_sqe *)entry;
		hw_op = ERDMA_OP_SEND;
		if (op == IB_WR_SEND_WITH_IMM) {
			hw_op = ERDMA_OP_SEND_WITH_IMM;
			send_sqe->imm_data = send_wr->ex.imm_data;
		} else if (op == IB_WR_SEND_WITH_INV) {
			hw_op = ERDMA_OP_SEND_WITH_INV;
			send_sqe->invalid_stag =
				cpu_to_le32(send_wr->ex.invalidate_rkey);
		}
		wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_OPCODE_MASK, hw_op);

		/* Payload (inline data or SGL) follows the fixed part. */
		length_field = &send_sqe->length;
		wqe_size = sizeof(struct erdma_send_sqe);
		sgl_offset = wqe_size;

		break;
	case IB_WR_REG_MR:
		wqe_hdr |=
			FIELD_PREP(ERDMA_SQE_HDR_OPCODE_MASK, ERDMA_OP_REG_MR);
		regmr_sge = (struct erdma_reg_mr_sqe *)entry;
		mr = to_emr(reg_wr(send_wr)->mr);

		/* Local read is always granted on top of the requested access. */
		mr->access = ERDMA_MR_ACC_LR |
			     to_erdma_access_flags(reg_wr(send_wr)->access);
		regmr_sge->addr = cpu_to_le64(mr->ibmr.iova);
		regmr_sge->length = cpu_to_le32(mr->ibmr.length);
		regmr_sge->stag = cpu_to_le32(mr->ibmr.lkey);
		attrs = FIELD_PREP(ERDMA_SQE_MR_MODE_MASK, 0) |
			FIELD_PREP(ERDMA_SQE_MR_ACCESS_MASK, mr->access) |
			FIELD_PREP(ERDMA_SQE_MR_MTT_CNT_MASK,
				   mr->mem.mtt_nents);

		if (mr->mem.mtt_nents < ERDMA_MAX_INLINE_MTT_ENTRIES) {
			attrs |= FIELD_PREP(ERDMA_SQE_MR_MTT_TYPE_MASK, 0);
			/* Copy SGLs to SQE content to accelerate */
			memcpy(get_queue_entry(qp->kern_qp.sq_buf, idx + 1,
					       qp->attrs.sq_size, SQEBB_SHIFT),
			       mr->mem.mtt_buf, MTT_SIZE(mr->mem.mtt_nents));
			wqe_size = sizeof(struct erdma_reg_mr_sqe) +
				   MTT_SIZE(mr->mem.mtt_nents);
		} else {
			attrs |= FIELD_PREP(ERDMA_SQE_MR_MTT_TYPE_MASK, 1);
			wqe_size = sizeof(struct erdma_reg_mr_sqe);
		}

		regmr_sge->attrs = cpu_to_le32(attrs);
		goto out;
	case IB_WR_LOCAL_INV:
		wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_OPCODE_MASK,
				      ERDMA_OP_LOCAL_INV);
		regmr_sge = (struct erdma_reg_mr_sqe *)entry;
		regmr_sge->stag = cpu_to_le32(send_wr->ex.invalidate_rkey);
		wqe_size = sizeof(struct erdma_reg_mr_sqe);
		goto out;
	default:
		return -EOPNOTSUPP;
	}

	/* Only the WRITE and SEND families reach this point. */
	if (!(flags & IB_SEND_INLINE)) {
		ret = fill_sgl(qp, send_wr, idx, sgl_offset, length_field);
		if (ret)
			return -EINVAL;
		wqe_size += send_wr->num_sge * sizeof(struct ib_sge);
		wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_SGL_LEN_MASK,
				      send_wr->num_sge);
	} else {
		ret = fill_inline_data(qp, send_wr, idx, sgl_offset,
				       length_field);
		if (ret < 0)
			return -EINVAL;
		wqe_size += ret;
		wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_SGL_LEN_MASK, ret);
	}

out:
	wqebb_cnt = SQEBB_COUNT(wqe_size);
	wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_WQEBB_CNT_MASK, wqebb_cnt - 1);
	*pi += wqebb_cnt;
	/* The index field carries the producer index after this WQE. */
	wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_WQEBB_INDEX_MASK, *pi);

	*entry = wqe_hdr;

	return 0;
}
|
||||
|
||||
/*
 * Ring the send queue doorbell: compose the QPN and producer index into one
 * 64-bit value, store it in sq_db_info and write it to hw_sq_db.
 */
static void kick_sq_db(struct erdma_qp *qp, u16 pi)
{
	u64 doorbell;

	doorbell = FIELD_PREP(ERDMA_SQE_HDR_QPN_MASK, QP_ID(qp)) |
		   FIELD_PREP(ERDMA_SQE_HDR_WQEBB_INDEX_MASK, pi);

	/* The in-memory copy is updated before the register write. */
	*(u64 *)qp->kern_qp.sq_db_info = doorbell;
	writeq(doorbell, qp->kern_qp.hw_sq_db);
}
|
||||
|
||||
int erdma_post_send(struct ib_qp *ibqp, const struct ib_send_wr *send_wr,
|
||||
const struct ib_send_wr **bad_send_wr)
|
||||
{
|
||||
struct erdma_qp *qp = to_eqp(ibqp);
|
||||
int ret = 0;
|
||||
const struct ib_send_wr *wr = send_wr;
|
||||
unsigned long flags;
|
||||
u16 sq_pi;
|
||||
|
||||
if (!send_wr)
|
||||
return -EINVAL;
|
||||
|
||||
spin_lock_irqsave(&qp->lock, flags);
|
||||
sq_pi = qp->kern_qp.sq_pi;
|
||||
|
||||
while (wr) {
|
||||
if ((u16)(sq_pi - qp->kern_qp.sq_ci) >= qp->attrs.sq_size) {
|
||||
ret = -ENOMEM;
|
||||
*bad_send_wr = send_wr;
|
||||
break;
|
||||
}
|
||||
|
||||
ret = erdma_push_one_sqe(qp, &sq_pi, wr);
|
||||
if (ret) {
|
||||
*bad_send_wr = wr;
|
||||
break;
|
||||
}
|
||||
qp->kern_qp.sq_pi = sq_pi;
|
||||
kick_sq_db(qp, sq_pi);
|
||||
|
||||
wr = wr->next;
|
||||
}
|
||||
spin_unlock_irqrestore(&qp->lock, flags);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int erdma_post_recv_one(struct erdma_qp *qp,
|
||||
const struct ib_recv_wr *recv_wr)
|
||||
{
|
||||
struct erdma_rqe *rqe =
|
||||
get_queue_entry(qp->kern_qp.rq_buf, qp->kern_qp.rq_pi,
|
||||
qp->attrs.rq_size, RQE_SHIFT);
|
||||
|
||||
rqe->qe_idx = cpu_to_le16(qp->kern_qp.rq_pi + 1);
|
||||
rqe->qpn = cpu_to_le32(QP_ID(qp));
|
||||
|
||||
if (recv_wr->num_sge == 0) {
|
||||
rqe->length = 0;
|
||||
} else if (recv_wr->num_sge == 1) {
|
||||
rqe->stag = cpu_to_le32(recv_wr->sg_list[0].lkey);
|
||||
rqe->to = cpu_to_le64(recv_wr->sg_list[0].addr);
|
||||
rqe->length = cpu_to_le32(recv_wr->sg_list[0].length);
|
||||
} else {
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
*(u64 *)qp->kern_qp.rq_db_info = *(u64 *)rqe;
|
||||
writeq(*(u64 *)rqe, qp->kern_qp.hw_rq_db);
|
||||
|
||||
qp->kern_qp.rwr_tbl[qp->kern_qp.rq_pi & (qp->attrs.rq_size - 1)] =
|
||||
recv_wr->wr_id;
|
||||
qp->kern_qp.rq_pi++;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int erdma_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *recv_wr,
|
||||
const struct ib_recv_wr **bad_recv_wr)
|
||||
{
|
||||
const struct ib_recv_wr *wr = recv_wr;
|
||||
struct erdma_qp *qp = to_eqp(ibqp);
|
||||
unsigned long flags;
|
||||
int ret;
|
||||
|
||||
spin_lock_irqsave(&qp->lock, flags);
|
||||
|
||||
while (wr) {
|
||||
ret = erdma_post_recv_one(qp, wr);
|
||||
if (ret) {
|
||||
*bad_recv_wr = wr;
|
||||
break;
|
||||
}
|
||||
wr = wr->next;
|
||||
}
|
||||
|
||||
spin_unlock_irqrestore(&qp->lock, flags);
|
||||
return ret;
|
||||
}
|
||||
1460
drivers/infiniband/hw/erdma/erdma_verbs.c
Normal file
1460
drivers/infiniband/hw/erdma/erdma_verbs.c
Normal file
File diff suppressed because it is too large
Load Diff
342
drivers/infiniband/hw/erdma/erdma_verbs.h
Normal file
342
drivers/infiniband/hw/erdma/erdma_verbs.h
Normal file
@@ -0,0 +1,342 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
|
||||
|
||||
/* Authors: Cheng Xu <chengyou@linux.alibaba.com> */
|
||||
/* Kai Shen <kaishen@linux.alibaba.com> */
|
||||
/* Copyright (c) 2020-2022, Alibaba Group. */
|
||||
|
||||
#ifndef __ERDMA_VERBS_H__
|
||||
#define __ERDMA_VERBS_H__
|
||||
|
||||
#include <linux/errno.h>
|
||||
|
||||
#include <rdma/ib_verbs.h>
|
||||
#include <rdma/ib_user_verbs.h>
|
||||
#include <rdma/iw_cm.h>
|
||||
|
||||
#include "erdma.h"
|
||||
#include "erdma_cm.h"
|
||||
#include "erdma_hw.h"
|
||||
|
||||
/* RDMA capability limits fixed by the driver. */
#define ERDMA_MAX_PD		(128 * 1024)
#define ERDMA_MAX_SEND_WR	4096
#define ERDMA_MAX_ORD		128
#define ERDMA_MAX_IRD		128
#define ERDMA_MAX_SGE_RD	1
#define ERDMA_MAX_CONTEXT	(128 * 1024)
#define ERDMA_MAX_SEND_SGE	6
#define ERDMA_MAX_RECV_SGE	1
/* Inline payload limit: the space a full send SGL would occupy. */
#define ERDMA_MAX_INLINE (sizeof(struct erdma_sge) * (ERDMA_MAX_SEND_SGE))
#define ERDMA_MAX_FRMR_PA	512
|
||||
|
||||
/*
 * mmap mapping kinds.
 * NOTE(review): presumably the values stored in
 * erdma_user_mmap_entry.mmap_flag - confirm against erdma_mmap().
 */
enum {
|
||||
ERDMA_MMAP_IO_NC = 0, /* no cache */
|
||||
};
|
||||
|
||||
/*
 * Driver wrapper around the core rdma_user_mmap_entry.
 * to_emmap() converts a pointer to the embedded rdma_entry back into this
 * structure via container_of().
 */
struct erdma_user_mmap_entry {
|
||||
struct rdma_user_mmap_entry rdma_entry; /* embedded core entry, see to_emmap() */
|
||||
u64 address; /* NOTE(review): presumably the address to be mapped - confirm */
|
||||
u8 mmap_flag; /* NOTE(review): presumably an ERDMA_MMAP_* value - confirm */
|
||||
};
|
||||
|
||||
/*
 * Per-user-context driver state.
 * Embeds the core ib_ucontext; to_ectx() recovers this structure from a
 * pointer to the embedded ibucontext via container_of().
 */
struct erdma_ucontext {
|
||||
struct ib_ucontext ibucontext; /* embedded core object, see to_ectx() */
|
||||
|
||||
/* NOTE(review): sdb_* presumably describe the send doorbell slot - confirm */
u32 sdb_type;
|
||||
u32 sdb_idx;
|
||||
u32 sdb_page_idx;
|
||||
u32 sdb_page_off;
|
||||
/* NOTE(review): presumably SQ / RQ / CQ doorbell addresses - confirm */
u64 sdb;
|
||||
u64 rdb;
|
||||
u64 cdb;
|
||||
|
||||
/* One mmap entry pointer per doorbell kind (SQ, RQ, CQ by name). */
struct rdma_user_mmap_entry *sq_db_mmap_entry;
|
||||
struct rdma_user_mmap_entry *rq_db_mmap_entry;
|
||||
struct rdma_user_mmap_entry *cq_db_mmap_entry;
|
||||
|
||||
/* doorbell records */
|
||||
struct list_head dbrecords_page_list;
|
||||
/* NOTE(review): presumably serialises dbrecords_page_list - confirm */
struct mutex dbrecords_page_mutex;
|
||||
};
|
||||
|
||||
/*
 * Driver protection domain.
 * Embeds the core ib_pd; to_epd() recovers this structure from a pointer to
 * the embedded ibpd via container_of().
 */
struct erdma_pd {
|
||||
struct ib_pd ibpd; /* embedded core object, see to_epd() */
|
||||
u32 pdn; /* NOTE(review): presumably the PD number - confirm */
|
||||
};
|
||||
|
||||
/*
|
||||
* MemoryRegion definition.
|
||||
*/
|
||||
#define ERDMA_MAX_INLINE_MTT_ENTRIES 4
|
||||
/*
 * Size in bytes of @mtt_cnt MTT entries; per mtt takes 8 Bytes.
 * The argument is parenthesised so that expression arguments such as
 * MTT_SIZE(a + b) shift the whole sum rather than only the last operand.
 */
#define MTT_SIZE(mtt_cnt) ((mtt_cnt) << 3)
|
||||
#define ERDMA_MR_MAX_MTT_CNT 524288
|
||||
#define ERDMA_MTT_ENTRY_SIZE 8
|
||||
|
||||
#define ERDMA_MR_TYPE_NORMAL 0
|
||||
#define ERDMA_MR_TYPE_FRMR 1
|
||||
#define ERDMA_MR_TYPE_DMA 2
|
||||
|
||||
#define ERDMA_MR_INLINE_MTT 0
|
||||
#define ERDMA_MR_INDIRECT_MTT 1
|
||||
|
||||
#define ERDMA_MR_ACC_LR BIT(0)
|
||||
#define ERDMA_MR_ACC_LW BIT(1)
|
||||
#define ERDMA_MR_ACC_RR BIT(2)
|
||||
#define ERDMA_MR_ACC_RW BIT(3)
|
||||
|
||||
static inline u8 to_erdma_access_flags(int access)
|
||||
{
|
||||
return (access & IB_ACCESS_REMOTE_READ ? ERDMA_MR_ACC_RR : 0) |
|
||||
(access & IB_ACCESS_LOCAL_WRITE ? ERDMA_MR_ACC_LW : 0) |
|
||||
(access & IB_ACCESS_REMOTE_WRITE ? ERDMA_MR_ACC_RW : 0);
|
||||
}
|
||||
|
||||
/*
 * Description of a memory area and its MTT entries.
 * Used for MRs (erdma_mr.mem), user QP queues (erdma_uqp.sq_mtt / rq_mtt)
 * and user CQ buffers (erdma_ucq_info.qbuf_mtt).
 */
struct erdma_mem {
|
||||
struct ib_umem *umem;
|
||||
void *mtt_buf;
|
||||
u32 mtt_type; /* NOTE(review): presumably ERDMA_MR_INLINE_MTT / ERDMA_MR_INDIRECT_MTT - confirm */
|
||||
u32 page_size;
|
||||
u32 page_offset;
|
||||
u32 page_cnt;
|
||||
u32 mtt_nents;
|
||||
|
||||
u64 va;
|
||||
u64 len;
|
||||
|
||||
/* Inline storage for up to ERDMA_MAX_INLINE_MTT_ENTRIES entries. */
u64 mtt_entry[ERDMA_MAX_INLINE_MTT_ENTRIES];
|
||||
};
|
||||
|
||||
/*
 * Driver memory region.
 * Embeds the core ib_mr; to_emr() recovers this structure from a pointer to
 * the embedded ibmr via container_of().
 */
struct erdma_mr {
|
||||
struct ib_mr ibmr; /* embedded core object, see to_emr() */
|
||||
struct erdma_mem mem;
|
||||
u8 type; /* NOTE(review): presumably an ERDMA_MR_TYPE_* value - confirm */
|
||||
u8 access; /* NOTE(review): presumably ERDMA_MR_ACC_* bits - confirm */
|
||||
u8 valid;
|
||||
};
|
||||
|
||||
/*
 * One user doorbell-records page.
 * NOTE(review): presumably linked on erdma_ucontext.dbrecords_page_list and
 * shared between users via refcnt - confirm against the users of this type.
 */
struct erdma_user_dbrecords_page {
|
||||
struct list_head list;
|
||||
struct ib_umem *umem;
|
||||
u64 va;
|
||||
int refcnt;
|
||||
};
|
||||
|
||||
/*
 * QP state specific to user-space QPs.
 * Shares storage with struct erdma_kqp through the anonymous union in
 * struct erdma_qp, so only one of the two is meaningful for a given QP.
 */
struct erdma_uqp {
|
||||
struct erdma_mem sq_mtt;
|
||||
struct erdma_mem rq_mtt;
|
||||
|
||||
dma_addr_t sq_db_info_dma_addr;
|
||||
dma_addr_t rq_db_info_dma_addr;
|
||||
|
||||
struct erdma_user_dbrecords_page *user_dbr_page;
|
||||
|
||||
u32 rq_offset;
|
||||
};
|
||||
|
||||
/*
 * QP state specific to kernel QPs.
 * Shares storage with struct erdma_uqp through the anonymous union in
 * struct erdma_qp, so only one of the two is meaningful for a given QP.
 */
struct erdma_kqp {
|
||||
/* NOTE(review): presumably SQ producer / consumer indices - confirm */
u16 sq_pi;
|
||||
u16 sq_ci;
|
||||
|
||||
/*
 * erdma_post_recv_one() uses rq_pi & (attrs.rq_size - 1) as the rwr_tbl
 * slot for the posted WR and then increments rq_pi.
 */
u16 rq_pi;
|
||||
u16 rq_ci;
|
||||
|
||||
u64 *swr_tbl;
|
||||
u64 *rwr_tbl; /* wr_id of each posted receive WR, see erdma_post_recv_one() */
|
||||
|
||||
void __iomem *hw_sq_db;
|
||||
void __iomem *hw_rq_db; /* written with writeq() when a receive WR is posted */
|
||||
|
||||
void *sq_buf;
|
||||
dma_addr_t sq_buf_dma_addr;
|
||||
|
||||
void *rq_buf;
|
||||
dma_addr_t rq_buf_dma_addr;
|
||||
|
||||
void *sq_db_info;
|
||||
void *rq_db_info; /* receives the first 8 bytes of the RQE before the doorbell write */
|
||||
|
||||
u8 sig_all;
|
||||
};
|
||||
|
||||
/*
 * QP states, stored in erdma_qp_attrs.state.
 * The numbering is explicit and not contiguous: value 6 is not assigned.
 */
enum erdma_qp_state {
|
||||
ERDMA_QP_STATE_IDLE = 0,
|
||||
ERDMA_QP_STATE_RTR = 1,
|
||||
ERDMA_QP_STATE_RTS = 2,
|
||||
ERDMA_QP_STATE_CLOSING = 3,
|
||||
ERDMA_QP_STATE_TERMINATE = 4,
|
||||
ERDMA_QP_STATE_ERROR = 5,
|
||||
ERDMA_QP_STATE_UNDEF = 7,
|
||||
ERDMA_QP_STATE_COUNT = 8
|
||||
};
|
||||
|
||||
/*
 * Bit mask type of the @mask parameter of erdma_modify_qp_internal().
 * Each enumerator is a single bit; bit 1 is not assigned.
 */
enum erdma_qp_attr_mask {
|
||||
ERDMA_QP_ATTR_STATE = (1 << 0),
|
||||
ERDMA_QP_ATTR_LLP_HANDLE = (1 << 2),
|
||||
ERDMA_QP_ATTR_ORD = (1 << 3),
|
||||
ERDMA_QP_ATTR_IRD = (1 << 4),
|
||||
ERDMA_QP_ATTR_SQ_SIZE = (1 << 5),
|
||||
ERDMA_QP_ATTR_RQ_SIZE = (1 << 6),
|
||||
ERDMA_QP_ATTR_MPA = (1 << 7)
|
||||
};
|
||||
|
||||
/*
 * QP attributes; stored in erdma_qp.attrs and passed to
 * erdma_modify_qp_internal().
 */
struct erdma_qp_attrs {
|
||||
enum erdma_qp_state state;
|
||||
enum erdma_cc_alg cc; /* Congestion control algorithm */
|
||||
u32 sq_size;
|
||||
/*
 * erdma_post_recv_one() masks the RQ producer index with (rq_size - 1),
 * which only wraps correctly when rq_size is a power of two.
 */
u32 rq_size;
|
||||
u32 orq_size;
|
||||
u32 irq_size;
|
||||
u32 max_send_sge;
|
||||
u32 max_recv_sge;
|
||||
u32 cookie;
|
||||
/* NOTE(review): presumably the values held by qp_type below - confirm */
#define ERDMA_QP_ACTIVE 0
|
||||
#define ERDMA_QP_PASSIVE 1
|
||||
u8 qp_type;
|
||||
u8 pd_len;
|
||||
};
|
||||
|
||||
/*
 * Driver queue pair.
 * Embeds the core ib_qp; to_eqp() recovers this structure from a pointer to
 * the embedded ibqp via container_of().
 */
struct erdma_qp {
|
||||
struct ib_qp ibqp; /* embedded core object, see to_eqp() */
|
||||
struct kref ref;
|
||||
struct completion safe_free;
|
||||
struct erdma_dev *dev;
|
||||
struct erdma_cep *cep;
|
||||
struct rw_semaphore state_lock;
|
||||
|
||||
/* Kernel-QP and user-QP state share the same storage. */
union {
|
||||
struct erdma_kqp kern_qp;
|
||||
struct erdma_uqp user_qp;
|
||||
};
|
||||
|
||||
/* NOTE(review): presumably the send and receive completion queues - confirm */
struct erdma_cq *scq;
|
||||
struct erdma_cq *rcq;
|
||||
|
||||
struct erdma_qp_attrs attrs;
|
||||
spinlock_t lock; /* held with IRQs saved by erdma_post_recv() while posting WRs */
|
||||
};
|
||||
|
||||
/*
 * CQ state specific to kernel CQs.
 * Shares storage with struct erdma_ucq_info through the anonymous union in
 * struct erdma_cq.
 */
struct erdma_kcq_info {
|
||||
void *qbuf;
|
||||
dma_addr_t qbuf_dma_addr;
|
||||
u32 ci; /* NOTE(review): presumably the consumer index - confirm */
|
||||
u32 cmdsn;
|
||||
u32 notify_cnt;
|
||||
|
||||
spinlock_t lock;
|
||||
u8 __iomem *db;
|
||||
u64 *db_record;
|
||||
};
|
||||
|
||||
/*
 * CQ state specific to user-space CQs.
 * Shares storage with struct erdma_kcq_info through the anonymous union in
 * struct erdma_cq.
 */
struct erdma_ucq_info {
|
||||
struct erdma_mem qbuf_mtt;
|
||||
struct erdma_user_dbrecords_page *user_dbr_page;
|
||||
dma_addr_t db_info_dma_addr;
|
||||
};
|
||||
|
||||
/*
 * Driver completion queue.
 * Embeds the core ib_cq; to_ecq() recovers this structure from a pointer to
 * the embedded ibcq via container_of().
 */
struct erdma_cq {
|
||||
struct ib_cq ibcq; /* embedded core object, see to_ecq() */
|
||||
u32 cqn; /* NOTE(review): presumably the key used by find_cq_by_cqn() - confirm */
|
||||
|
||||
u32 depth;
|
||||
u32 assoc_eqn;
|
||||
|
||||
/* Kernel-CQ and user-CQ state share the same storage. */
union {
|
||||
struct erdma_kcq_info kern_cq;
|
||||
struct erdma_ucq_info user_cq;
|
||||
};
|
||||
};
|
||||
|
||||
#define QP_ID(qp) ((qp)->ibqp.qp_num)
|
||||
|
||||
static inline struct erdma_qp *find_qp_by_qpn(struct erdma_dev *dev, int id)
|
||||
{
|
||||
return (struct erdma_qp *)xa_load(&dev->qp_xa, id);
|
||||
}
|
||||
|
||||
static inline struct erdma_cq *find_cq_by_cqn(struct erdma_dev *dev, int id)
|
||||
{
|
||||
return (struct erdma_cq *)xa_load(&dev->cq_xa, id);
|
||||
}
|
||||
|
||||
void erdma_qp_get(struct erdma_qp *qp);
|
||||
void erdma_qp_put(struct erdma_qp *qp);
|
||||
int erdma_modify_qp_internal(struct erdma_qp *qp, struct erdma_qp_attrs *attrs,
|
||||
enum erdma_qp_attr_mask mask);
|
||||
void erdma_qp_llp_close(struct erdma_qp *qp);
|
||||
void erdma_qp_cm_drop(struct erdma_qp *qp);
|
||||
|
||||
static inline struct erdma_ucontext *to_ectx(struct ib_ucontext *ibctx)
|
||||
{
|
||||
return container_of(ibctx, struct erdma_ucontext, ibucontext);
|
||||
}
|
||||
|
||||
static inline struct erdma_pd *to_epd(struct ib_pd *pd)
|
||||
{
|
||||
return container_of(pd, struct erdma_pd, ibpd);
|
||||
}
|
||||
|
||||
static inline struct erdma_mr *to_emr(struct ib_mr *ibmr)
|
||||
{
|
||||
return container_of(ibmr, struct erdma_mr, ibmr);
|
||||
}
|
||||
|
||||
static inline struct erdma_qp *to_eqp(struct ib_qp *qp)
|
||||
{
|
||||
return container_of(qp, struct erdma_qp, ibqp);
|
||||
}
|
||||
|
||||
static inline struct erdma_cq *to_ecq(struct ib_cq *ibcq)
|
||||
{
|
||||
return container_of(ibcq, struct erdma_cq, ibcq);
|
||||
}
|
||||
|
||||
static inline struct erdma_user_mmap_entry *
|
||||
to_emmap(struct rdma_user_mmap_entry *ibmmap)
|
||||
{
|
||||
return container_of(ibmmap, struct erdma_user_mmap_entry, rdma_entry);
|
||||
}
|
||||
|
||||
int erdma_alloc_ucontext(struct ib_ucontext *ibctx, struct ib_udata *data);
|
||||
void erdma_dealloc_ucontext(struct ib_ucontext *ibctx);
|
||||
int erdma_query_device(struct ib_device *dev, struct ib_device_attr *attr,
|
||||
struct ib_udata *data);
|
||||
int erdma_get_port_immutable(struct ib_device *dev, u32 port,
|
||||
struct ib_port_immutable *ib_port_immutable);
|
||||
int erdma_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
|
||||
struct ib_udata *data);
|
||||
int erdma_query_port(struct ib_device *dev, u32 port,
|
||||
struct ib_port_attr *attr);
|
||||
int erdma_query_gid(struct ib_device *dev, u32 port, int idx,
|
||||
union ib_gid *gid);
|
||||
int erdma_alloc_pd(struct ib_pd *ibpd, struct ib_udata *data);
|
||||
int erdma_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata);
|
||||
int erdma_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attr,
|
||||
struct ib_udata *data);
|
||||
int erdma_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int mask,
|
||||
struct ib_qp_init_attr *init_attr);
|
||||
int erdma_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int mask,
|
||||
struct ib_udata *data);
|
||||
int erdma_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata);
|
||||
int erdma_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata);
|
||||
int erdma_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags);
|
||||
struct ib_mr *erdma_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 len,
|
||||
u64 virt, int access, struct ib_udata *udata);
|
||||
struct ib_mr *erdma_get_dma_mr(struct ib_pd *ibpd, int rights);
|
||||
int erdma_dereg_mr(struct ib_mr *ibmr, struct ib_udata *data);
|
||||
int erdma_mmap(struct ib_ucontext *ctx, struct vm_area_struct *vma);
|
||||
void erdma_mmap_free(struct rdma_user_mmap_entry *rdma_entry);
|
||||
void erdma_qp_get_ref(struct ib_qp *ibqp);
|
||||
void erdma_qp_put_ref(struct ib_qp *ibqp);
|
||||
struct ib_qp *erdma_get_ibqp(struct ib_device *dev, int id);
|
||||
int erdma_post_send(struct ib_qp *ibqp, const struct ib_send_wr *send_wr,
|
||||
const struct ib_send_wr **bad_send_wr);
|
||||
int erdma_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *recv_wr,
|
||||
const struct ib_recv_wr **bad_recv_wr);
|
||||
int erdma_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc);
|
||||
struct ib_mr *erdma_ib_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type,
|
||||
u32 max_num_sg);
|
||||
int erdma_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
|
||||
unsigned int *sg_offset);
|
||||
void erdma_port_event(struct erdma_dev *dev, enum ib_event_type reason);
|
||||
|
||||
#endif
|
||||
@@ -1,7 +1,7 @@
|
||||
# SPDX-License-Identifier: GPL-2.0-only
|
||||
config INFINIBAND_HFI1
|
||||
tristate "Cornelis OPX Gen1 support"
|
||||
depends on X86_64 && INFINIBAND_RDMAVT && I2C
|
||||
depends on X86_64 && INFINIBAND_RDMAVT && I2C && !UML
|
||||
select MMU_NOTIFIER
|
||||
select CRC32
|
||||
select I2C_ALGOBIT
|
||||
|
||||
@@ -1179,8 +1179,10 @@ static int setup_base_ctxt(struct hfi1_filedata *fd,
|
||||
goto done;
|
||||
|
||||
ret = init_user_ctxt(fd, uctxt);
|
||||
if (ret)
|
||||
if (ret) {
|
||||
hfi1_free_ctxt_rcv_groups(uctxt);
|
||||
goto done;
|
||||
}
|
||||
|
||||
user_init(uctxt);
|
||||
|
||||
|
||||
@@ -742,9 +742,7 @@ int hfi1_ipoib_txreq_init(struct hfi1_ipoib_dev_priv *priv)
|
||||
kzalloc_node(sizeof(*tx->sdma_hdr),
|
||||
GFP_KERNEL, priv->dd->node);
|
||||
|
||||
netif_tx_napi_add(dev, &txq->napi,
|
||||
hfi1_ipoib_poll_tx_ring,
|
||||
NAPI_POLL_WEIGHT);
|
||||
netif_napi_add_tx(dev, &txq->napi, hfi1_ipoib_poll_tx_ring);
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
||||
@@ -216,7 +216,7 @@ static int hfi1_netdev_rxq_init(struct hfi1_netdev_rx *rx)
|
||||
* right now.
|
||||
*/
|
||||
set_bit(NAPI_STATE_NO_BUSY_POLL, &rxq->napi.state);
|
||||
netif_napi_add(dev, &rxq->napi, hfi1_netdev_rx_napi, 64);
|
||||
netif_napi_add_weight(dev, &rxq->napi, hfi1_netdev_rx_napi, 64);
|
||||
rc = msix_netdev_request_rcd_irq(rxq->rcd);
|
||||
if (rc)
|
||||
goto bail_context_irq_failure;
|
||||
|
||||
@@ -172,7 +172,7 @@ static inline void jcopy(u8 *dest, const u8 *src, u32 n)
|
||||
}
|
||||
|
||||
/*
|
||||
* Read nbytes from "from" and and place them in the low bytes
|
||||
* Read nbytes from "from" and place them in the low bytes
|
||||
* of pbuf->carry. Other bytes are left as-is. Any previous
|
||||
* value in pbuf->carry is lost.
|
||||
*
|
||||
|
||||
@@ -959,6 +959,7 @@ struct hns_roce_dev {
|
||||
const struct hns_roce_hw *hw;
|
||||
void *priv;
|
||||
struct workqueue_struct *irq_workq;
|
||||
struct work_struct ecc_work;
|
||||
const struct hns_roce_dfx_hw *dfx;
|
||||
u32 func_num;
|
||||
u32 is_vf;
|
||||
|
||||
@@ -55,6 +55,42 @@ enum {
|
||||
CMD_RST_PRC_EBUSY,
|
||||
};
|
||||
|
||||
/*
 * Resource types that can be reported in a RAM ECC query.
 * The values are used as indices into fmea_ram_res[]; fmea_ram_ecc_recover()
 * checks at build time that the table has exactly ECC_RESOURCE_COUNT entries
 * and rejects any reported type >= ECC_RESOURCE_COUNT at run time.
 */
enum ecc_resource_type {
|
||||
ECC_RESOURCE_QPC,
|
||||
ECC_RESOURCE_CQC,
|
||||
ECC_RESOURCE_MPT,
|
||||
ECC_RESOURCE_SRQC,
|
||||
ECC_RESOURCE_GMV,
|
||||
ECC_RESOURCE_QPC_TIMER,
|
||||
ECC_RESOURCE_CQC_TIMER,
|
||||
ECC_RESOURCE_SCCC,
|
||||
ECC_RESOURCE_COUNT, /* number of resource types, not a resource itself */
|
||||
};
|
||||
|
||||
static const struct {
|
||||
const char *name;
|
||||
u8 read_bt0_op;
|
||||
u8 write_bt0_op;
|
||||
} fmea_ram_res[] = {
|
||||
{ "ECC_RESOURCE_QPC",
|
||||
HNS_ROCE_CMD_READ_QPC_BT0, HNS_ROCE_CMD_WRITE_QPC_BT0 },
|
||||
{ "ECC_RESOURCE_CQC",
|
||||
HNS_ROCE_CMD_READ_CQC_BT0, HNS_ROCE_CMD_WRITE_CQC_BT0 },
|
||||
{ "ECC_RESOURCE_MPT",
|
||||
HNS_ROCE_CMD_READ_MPT_BT0, HNS_ROCE_CMD_WRITE_MPT_BT0 },
|
||||
{ "ECC_RESOURCE_SRQC",
|
||||
HNS_ROCE_CMD_READ_SRQC_BT0, HNS_ROCE_CMD_WRITE_SRQC_BT0 },
|
||||
/* ECC_RESOURCE_GMV is handled by cmdq, not mailbox */
|
||||
{ "ECC_RESOURCE_GMV",
|
||||
0, 0 },
|
||||
{ "ECC_RESOURCE_QPC_TIMER",
|
||||
HNS_ROCE_CMD_READ_QPC_TIMER_BT0, HNS_ROCE_CMD_WRITE_QPC_TIMER_BT0 },
|
||||
{ "ECC_RESOURCE_CQC_TIMER",
|
||||
HNS_ROCE_CMD_READ_CQC_TIMER_BT0, HNS_ROCE_CMD_WRITE_CQC_TIMER_BT0 },
|
||||
{ "ECC_RESOURCE_SCCC",
|
||||
HNS_ROCE_CMD_READ_SCCC_BT0, HNS_ROCE_CMD_WRITE_SCCC_BT0 },
|
||||
};
|
||||
|
||||
static inline void set_data_seg_v2(struct hns_roce_v2_wqe_data_seg *dseg,
|
||||
struct ib_sge *sg)
|
||||
{
|
||||
@@ -5855,12 +5891,12 @@ static struct hns_roce_aeqe *next_aeqe_sw_v2(struct hns_roce_eq *eq)
|
||||
!!(eq->cons_index & eq->entries)) ? aeqe : NULL;
|
||||
}
|
||||
|
||||
static int hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev,
|
||||
struct hns_roce_eq *eq)
|
||||
static irqreturn_t hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev,
|
||||
struct hns_roce_eq *eq)
|
||||
{
|
||||
struct device *dev = hr_dev->dev;
|
||||
struct hns_roce_aeqe *aeqe = next_aeqe_sw_v2(eq);
|
||||
int aeqe_found = 0;
|
||||
irqreturn_t aeqe_found = IRQ_NONE;
|
||||
int event_type;
|
||||
u32 queue_num;
|
||||
int sub_type;
|
||||
@@ -5914,7 +5950,7 @@ static int hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev,
|
||||
eq->event_type = event_type;
|
||||
eq->sub_type = sub_type;
|
||||
++eq->cons_index;
|
||||
aeqe_found = 1;
|
||||
aeqe_found = IRQ_HANDLED;
|
||||
|
||||
hns_roce_v2_init_irq_work(hr_dev, eq, queue_num);
|
||||
|
||||
@@ -5922,7 +5958,8 @@ static int hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev,
|
||||
}
|
||||
|
||||
update_eq_db(eq);
|
||||
return aeqe_found;
|
||||
|
||||
return IRQ_RETVAL(aeqe_found);
|
||||
}
|
||||
|
||||
static struct hns_roce_ceqe *next_ceqe_sw_v2(struct hns_roce_eq *eq)
|
||||
@@ -5937,11 +5974,11 @@ static struct hns_roce_ceqe *next_ceqe_sw_v2(struct hns_roce_eq *eq)
|
||||
!!(eq->cons_index & eq->entries)) ? ceqe : NULL;
|
||||
}
|
||||
|
||||
static int hns_roce_v2_ceq_int(struct hns_roce_dev *hr_dev,
|
||||
struct hns_roce_eq *eq)
|
||||
static irqreturn_t hns_roce_v2_ceq_int(struct hns_roce_dev *hr_dev,
|
||||
struct hns_roce_eq *eq)
|
||||
{
|
||||
struct hns_roce_ceqe *ceqe = next_ceqe_sw_v2(eq);
|
||||
int ceqe_found = 0;
|
||||
irqreturn_t ceqe_found = IRQ_NONE;
|
||||
u32 cqn;
|
||||
|
||||
while (ceqe) {
|
||||
@@ -5955,21 +5992,21 @@ static int hns_roce_v2_ceq_int(struct hns_roce_dev *hr_dev,
|
||||
hns_roce_cq_completion(hr_dev, cqn);
|
||||
|
||||
++eq->cons_index;
|
||||
ceqe_found = 1;
|
||||
ceqe_found = IRQ_HANDLED;
|
||||
|
||||
ceqe = next_ceqe_sw_v2(eq);
|
||||
}
|
||||
|
||||
update_eq_db(eq);
|
||||
|
||||
return ceqe_found;
|
||||
return IRQ_RETVAL(ceqe_found);
|
||||
}
|
||||
|
||||
static irqreturn_t hns_roce_v2_msix_interrupt_eq(int irq, void *eq_ptr)
|
||||
{
|
||||
struct hns_roce_eq *eq = eq_ptr;
|
||||
struct hns_roce_dev *hr_dev = eq->hr_dev;
|
||||
int int_work;
|
||||
irqreturn_t int_work;
|
||||
|
||||
if (eq->type_flag == HNS_ROCE_CEQ)
|
||||
/* Completion event interrupt */
|
||||
@@ -5981,27 +6018,22 @@ static irqreturn_t hns_roce_v2_msix_interrupt_eq(int irq, void *eq_ptr)
|
||||
return IRQ_RETVAL(int_work);
|
||||
}
|
||||
|
||||
static irqreturn_t hns_roce_v2_msix_interrupt_abn(int irq, void *dev_id)
|
||||
static irqreturn_t abnormal_interrupt_basic(struct hns_roce_dev *hr_dev,
|
||||
u32 int_st)
|
||||
{
|
||||
struct hns_roce_dev *hr_dev = dev_id;
|
||||
struct device *dev = hr_dev->dev;
|
||||
int int_work = 0;
|
||||
u32 int_st;
|
||||
struct pci_dev *pdev = hr_dev->pci_dev;
|
||||
struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev);
|
||||
const struct hnae3_ae_ops *ops = ae_dev->ops;
|
||||
irqreturn_t int_work = IRQ_NONE;
|
||||
u32 int_en;
|
||||
|
||||
/* Abnormal interrupt */
|
||||
int_st = roce_read(hr_dev, ROCEE_VF_ABN_INT_ST_REG);
|
||||
int_en = roce_read(hr_dev, ROCEE_VF_ABN_INT_EN_REG);
|
||||
|
||||
if (int_st & BIT(HNS_ROCE_V2_VF_INT_ST_AEQ_OVERFLOW_S)) {
|
||||
struct pci_dev *pdev = hr_dev->pci_dev;
|
||||
struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev);
|
||||
const struct hnae3_ae_ops *ops = ae_dev->ops;
|
||||
dev_err(hr_dev->dev, "AEQ overflow!\n");
|
||||
|
||||
dev_err(dev, "AEQ overflow!\n");
|
||||
|
||||
int_st |= 1 << HNS_ROCE_V2_VF_INT_ST_AEQ_OVERFLOW_S;
|
||||
roce_write(hr_dev, ROCEE_VF_ABN_INT_ST_REG, int_st);
|
||||
roce_write(hr_dev, ROCEE_VF_ABN_INT_ST_REG,
|
||||
1 << HNS_ROCE_V2_VF_INT_ST_AEQ_OVERFLOW_S);
|
||||
|
||||
/* Set reset level for reset_event() */
|
||||
if (ops->set_default_reset_request)
|
||||
@@ -6013,19 +6045,165 @@ static irqreturn_t hns_roce_v2_msix_interrupt_abn(int irq, void *dev_id)
|
||||
int_en |= 1 << HNS_ROCE_V2_VF_ABN_INT_EN_S;
|
||||
roce_write(hr_dev, ROCEE_VF_ABN_INT_EN_REG, int_en);
|
||||
|
||||
int_work = 1;
|
||||
} else if (int_st & BIT(HNS_ROCE_V2_VF_INT_ST_RAS_INT_S)) {
|
||||
dev_err(dev, "RAS interrupt!\n");
|
||||
|
||||
int_st |= 1 << HNS_ROCE_V2_VF_INT_ST_RAS_INT_S;
|
||||
roce_write(hr_dev, ROCEE_VF_ABN_INT_ST_REG, int_st);
|
||||
|
||||
int_en |= 1 << HNS_ROCE_V2_VF_ABN_INT_EN_S;
|
||||
roce_write(hr_dev, ROCEE_VF_ABN_INT_EN_REG, int_en);
|
||||
|
||||
int_work = 1;
|
||||
int_work = IRQ_HANDLED;
|
||||
} else {
|
||||
dev_err(dev, "There is no abnormal irq found!\n");
|
||||
dev_err(hr_dev->dev, "there is no basic abn irq found.\n");
|
||||
}
|
||||
|
||||
return IRQ_RETVAL(int_work);
|
||||
}
|
||||
|
||||
static int fmea_ram_ecc_query(struct hns_roce_dev *hr_dev,
|
||||
struct fmea_ram_ecc *ecc_info)
|
||||
{
|
||||
struct hns_roce_cmq_desc desc;
|
||||
struct hns_roce_cmq_req *req = (struct hns_roce_cmq_req *)desc.data;
|
||||
int ret;
|
||||
|
||||
hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_QUERY_RAM_ECC, true);
|
||||
ret = hns_roce_cmq_send(hr_dev, &desc, 1);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
ecc_info->is_ecc_err = hr_reg_read(req, QUERY_RAM_ECC_1BIT_ERR);
|
||||
ecc_info->res_type = hr_reg_read(req, QUERY_RAM_ECC_RES_TYPE);
|
||||
ecc_info->index = hr_reg_read(req, QUERY_RAM_ECC_TAG);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Recover GMV entry @idx: read its base address with a
 * HNS_ROCE_OPC_CFG_GMV_BT command and write the same address back with a
 * second command.
 *
 * Returns 0 on success or the error from hns_roce_cmq_send().
 */
static int fmea_recover_gmv(struct hns_roce_dev *hr_dev, u32 idx)
{
	struct hns_roce_cmq_desc cmd;
	struct hns_roce_cmq_req *body = (struct hns_roce_cmq_req *)cmd.data;
	u32 ba_hi;
	u32 ba_lo;
	int err;

	/* Step 1: read command for the selected index. */
	hns_roce_cmq_setup_basic_desc(&cmd, HNS_ROCE_OPC_CFG_GMV_BT, true);
	hr_reg_write(body, CFG_GMV_BT_IDX, idx);

	err = hns_roce_cmq_send(hr_dev, &cmd, 1);
	if (err) {
		dev_err(hr_dev->dev,
			"failed to execute cmd to read gmv, ret = %d.\n", err);
		return err;
	}

	ba_lo = hr_reg_read(body, CFG_GMV_BT_BA_L);
	ba_hi = hr_reg_read(body, CFG_GMV_BT_BA_H);

	/* Step 2: write the address that was just read back to the same index. */
	hns_roce_cmq_setup_basic_desc(&cmd, HNS_ROCE_OPC_CFG_GMV_BT, false);
	hr_reg_write(body, CFG_GMV_BT_BA_L, ba_lo);
	hr_reg_write(body, CFG_GMV_BT_BA_H, ba_hi);
	hr_reg_write(body, CFG_GMV_BT_IDX, idx);

	err = hns_roce_cmq_send(hr_dev, &cmd, 1);
	return err;
}
|
||||
|
||||
/*
 * Convert the little-endian value at @data into the address used to rewrite
 * a resource of type @res_type.
 *
 * For the QPC timer, CQC timer and SCCC resources the value is returned
 * as-is; for every other type it is shifted left by PAGE_SHIFT.
 */
static u64 fmea_get_ram_res_addr(u32 res_type, __le64 *data)
{
	u64 raw = le64_to_cpu(*data);

	switch (res_type) {
	case ECC_RESOURCE_QPC_TIMER:
	case ECC_RESOURCE_CQC_TIMER:
	case ECC_RESOURCE_SCCC:
		return raw;
	default:
		return raw << PAGE_SHIFT;
	}
}
|
||||
|
||||
/*
 * Recover a mailbox-handled resource: read entry @index of @res_type into a
 * command mailbox with the table's read opcode, derive the address from the
 * mailbox contents, then issue the table's write opcode with that address.
 *
 * @res_type indexes fmea_ram_res[]; the caller is expected to have range
 * checked it. Returns 0 on success, PTR_ERR() of a failed mailbox
 * allocation, or the error from hns_roce_cmd_mbox(). The mailbox is always
 * freed before returning.
 */
static int fmea_recover_others(struct hns_roce_dev *hr_dev, u32 res_type,
			       u32 index)
{
	struct hns_roce_cmd_mailbox *mbox;
	u8 rd_op = fmea_ram_res[res_type].read_bt0_op;
	u8 wr_op = fmea_ram_res[res_type].write_bt0_op;
	u64 res_addr;
	int err;

	mbox = hns_roce_alloc_cmd_mailbox(hr_dev);
	if (IS_ERR(mbox))
		return PTR_ERR(mbox);

	err = hns_roce_cmd_mbox(hr_dev, 0, mbox->dma, rd_op, index);
	if (err) {
		dev_err(hr_dev->dev,
			"failed to execute cmd to read fmea ram, ret = %d.\n",
			err);
	} else {
		res_addr = fmea_get_ram_res_addr(res_type, mbox->buf);

		err = hns_roce_cmd_mbox(hr_dev, res_addr, 0, wr_op, index);
		if (err)
			dev_err(hr_dev->dev,
				"failed to execute cmd to write fmea ram, ret = %d.\n",
				err);
	}

	hns_roce_free_cmd_mailbox(hr_dev, mbox);
	return err;
}
|
||||
|
||||
static void fmea_ram_ecc_recover(struct hns_roce_dev *hr_dev,
|
||||
struct fmea_ram_ecc *ecc_info)
|
||||
{
|
||||
u32 res_type = ecc_info->res_type;
|
||||
u32 index = ecc_info->index;
|
||||
int ret;
|
||||
|
||||
BUILD_BUG_ON(ARRAY_SIZE(fmea_ram_res) != ECC_RESOURCE_COUNT);
|
||||
|
||||
if (res_type >= ECC_RESOURCE_COUNT) {
|
||||
dev_err(hr_dev->dev, "unsupported fmea ram ecc type %u.\n",
|
||||
res_type);
|
||||
return;
|
||||
}
|
||||
|
||||
if (res_type == ECC_RESOURCE_GMV)
|
||||
ret = fmea_recover_gmv(hr_dev, index);
|
||||
else
|
||||
ret = fmea_recover_others(hr_dev, res_type, index);
|
||||
if (ret)
|
||||
dev_err(hr_dev->dev,
|
||||
"failed to recover %s, index = %u, ret = %d.\n",
|
||||
fmea_ram_res[res_type].name, index, ret);
|
||||
}
|
||||
|
||||
static void fmea_ram_ecc_work(struct work_struct *ecc_work)
|
||||
{
|
||||
struct hns_roce_dev *hr_dev =
|
||||
container_of(ecc_work, struct hns_roce_dev, ecc_work);
|
||||
struct fmea_ram_ecc ecc_info = {};
|
||||
|
||||
if (fmea_ram_ecc_query(hr_dev, &ecc_info)) {
|
||||
dev_err(hr_dev->dev, "failed to query fmea ram ecc.\n");
|
||||
return;
|
||||
}
|
||||
|
||||
if (!ecc_info.is_ecc_err) {
|
||||
dev_err(hr_dev->dev, "there is no fmea ram ecc err found.\n");
|
||||
return;
|
||||
}
|
||||
|
||||
fmea_ram_ecc_recover(hr_dev, &ecc_info);
|
||||
}
|
||||
|
||||
static irqreturn_t hns_roce_v2_msix_interrupt_abn(int irq, void *dev_id)
|
||||
{
|
||||
struct hns_roce_dev *hr_dev = dev_id;
|
||||
irqreturn_t int_work = IRQ_NONE;
|
||||
u32 int_st;
|
||||
|
||||
int_st = roce_read(hr_dev, ROCEE_VF_ABN_INT_ST_REG);
|
||||
|
||||
if (int_st) {
|
||||
int_work = abnormal_interrupt_basic(hr_dev, int_st);
|
||||
} else if (hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP09) {
|
||||
queue_work(hr_dev->irq_workq, &hr_dev->ecc_work);
|
||||
int_work = IRQ_HANDLED;
|
||||
} else {
|
||||
dev_err(hr_dev->dev, "there is no abnormal irq found.\n");
|
||||
}
|
||||
|
||||
return IRQ_RETVAL(int_work);
|
||||
@@ -6342,6 +6520,8 @@ static int hns_roce_v2_init_eq_table(struct hns_roce_dev *hr_dev)
|
||||
}
|
||||
}
|
||||
|
||||
INIT_WORK(&hr_dev->ecc_work, fmea_ram_ecc_work);
|
||||
|
||||
hr_dev->irq_workq = alloc_ordered_workqueue("hns_roce_irq_workq", 0);
|
||||
if (!hr_dev->irq_workq) {
|
||||
dev_err(dev, "failed to create irq workqueue.\n");
|
||||
|
||||
@@ -250,6 +250,7 @@ enum hns_roce_opcode_type {
|
||||
HNS_ROCE_OPC_CFG_GMV_TBL = 0x850f,
|
||||
HNS_ROCE_OPC_CFG_GMV_BT = 0x8510,
|
||||
HNS_ROCE_OPC_EXT_CFG = 0x8512,
|
||||
HNS_ROCE_QUERY_RAM_ECC = 0x8513,
|
||||
HNS_SWITCH_PARAMETER_CFG = 0x1033,
|
||||
};
|
||||
|
||||
@@ -1107,6 +1108,11 @@ enum {
|
||||
#define CFG_GMV_BT_BA_H CMQ_REQ_FIELD_LOC(51, 32)
|
||||
#define CFG_GMV_BT_IDX CMQ_REQ_FIELD_LOC(95, 64)
|
||||
|
||||
/* Fields of HNS_ROCE_QUERY_RAM_ECC */
|
||||
#define QUERY_RAM_ECC_1BIT_ERR CMQ_REQ_FIELD_LOC(31, 0)
|
||||
#define QUERY_RAM_ECC_RES_TYPE CMQ_REQ_FIELD_LOC(63, 32)
|
||||
#define QUERY_RAM_ECC_TAG CMQ_REQ_FIELD_LOC(95, 64)
|
||||
|
||||
struct hns_roce_cfg_sgid_tb {
|
||||
__le32 table_idx_rsv;
|
||||
__le32 vf_sgid_l;
|
||||
@@ -1343,6 +1349,12 @@ struct hns_roce_dip {
|
||||
struct list_head node; /* all dips are on a list */
|
||||
};
|
||||
|
||||
/*
 * Result of a HNS_ROCE_QUERY_RAM_ECC command, filled by
 * fmea_ram_ecc_query() and consumed by fmea_ram_ecc_recover().
 */
struct fmea_ram_ecc {
|
||||
u32 is_ecc_err; /* QUERY_RAM_ECC_1BIT_ERR field; zero means no error reported */
|
||||
u32 res_type; /* QUERY_RAM_ECC_RES_TYPE field; compared against enum ecc_resource_type */
|
||||
u32 index; /* QUERY_RAM_ECC_TAG field; passed as the entry index to recover */
|
||||
};
|
||||
|
||||
/* only for RNR timeout issue of HIP08 */
|
||||
#define HNS_ROCE_CLOCK_ADJUST 1000
|
||||
#define HNS_ROCE_MAX_CQ_PERIOD 65
|
||||
@@ -1382,7 +1394,6 @@ struct hns_roce_dip {
|
||||
#define HNS_ROCE_V2_ASYNC_EQE_NUM 0x1000
|
||||
|
||||
#define HNS_ROCE_V2_VF_INT_ST_AEQ_OVERFLOW_S 0
|
||||
#define HNS_ROCE_V2_VF_INT_ST_RAS_INT_S 1
|
||||
|
||||
#define HNS_ROCE_EQ_DB_CMD_AEQ 0x0
|
||||
#define HNS_ROCE_EQ_DB_CMD_AEQ_ARMED 0x1
|
||||
|
||||
@@ -1477,12 +1477,13 @@ irdma_find_listener(struct irdma_cm_core *cm_core, u32 *dst_addr, u16 dst_port,
|
||||
list_for_each_entry (listen_node, &cm_core->listen_list, list) {
|
||||
memcpy(listen_addr, listen_node->loc_addr, sizeof(listen_addr));
|
||||
listen_port = listen_node->loc_port;
|
||||
if (listen_port != dst_port ||
|
||||
!(listener_state & listen_node->listener_state))
|
||||
continue;
|
||||
/* compare node pair, return node handle if a match */
|
||||
if ((!memcmp(listen_addr, dst_addr, sizeof(listen_addr)) ||
|
||||
!memcmp(listen_addr, ip_zero, sizeof(listen_addr))) &&
|
||||
listen_port == dst_port &&
|
||||
vlan_id == listen_node->vlan_id &&
|
||||
(listener_state & listen_node->listener_state)) {
|
||||
if (!memcmp(listen_addr, ip_zero, sizeof(listen_addr)) ||
|
||||
(!memcmp(listen_addr, dst_addr, sizeof(listen_addr)) &&
|
||||
vlan_id == listen_node->vlan_id)) {
|
||||
refcount_inc(&listen_node->refcnt);
|
||||
spin_unlock_irqrestore(&cm_core->listen_list_lock,
|
||||
flags);
|
||||
|
||||
@@ -4872,10 +4872,12 @@ int irdma_cfg_fpm_val(struct irdma_sc_dev *dev, u32 qp_count)
|
||||
|
||||
sd_diff = sd_needed - hmc_fpm_misc->max_sds;
|
||||
if (sd_diff > 128) {
|
||||
if (qpwanted > 128 && sd_diff > 144)
|
||||
if (!(loop_count % 2) && qpwanted > 128) {
|
||||
qpwanted /= 2;
|
||||
mrwanted /= 2;
|
||||
pblewanted /= 2;
|
||||
} else {
|
||||
mrwanted /= 2;
|
||||
pblewanted /= 2;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
if (dev->cqp->hmc_profile != IRDMA_HMC_PROFILE_FAVOR_VF &&
|
||||
|
||||
@@ -257,10 +257,6 @@ static void irdma_process_aeq(struct irdma_pci_f *rf)
|
||||
iwqp->last_aeq = info->ae_id;
|
||||
spin_unlock_irqrestore(&iwqp->lock, flags);
|
||||
ctx_info = &iwqp->ctx_info;
|
||||
if (rdma_protocol_roce(&iwqp->iwdev->ibdev, 1))
|
||||
ctx_info->roce_info->err_rq_idx_valid = true;
|
||||
else
|
||||
ctx_info->iwarp_info->err_rq_idx_valid = true;
|
||||
} else {
|
||||
if (info->ae_id != IRDMA_AE_CQ_OPERATION_ERROR)
|
||||
continue;
|
||||
@@ -370,16 +366,12 @@ static void irdma_process_aeq(struct irdma_pci_f *rf)
|
||||
case IRDMA_AE_LCE_FUNCTION_CATASTROPHIC:
|
||||
case IRDMA_AE_LCE_CQ_CATASTROPHIC:
|
||||
case IRDMA_AE_UDA_XMIT_DGRAM_TOO_LONG:
|
||||
if (rdma_protocol_roce(&iwdev->ibdev, 1))
|
||||
ctx_info->roce_info->err_rq_idx_valid = false;
|
||||
else
|
||||
ctx_info->iwarp_info->err_rq_idx_valid = false;
|
||||
fallthrough;
|
||||
default:
|
||||
ibdev_err(&iwdev->ibdev, "abnormal ae_id = 0x%x bool qp=%d qp_id = %d\n",
|
||||
info->ae_id, info->qp, info->qp_cq_id);
|
||||
ibdev_err(&iwdev->ibdev, "abnormal ae_id = 0x%x bool qp=%d qp_id = %d, ae_src=%d\n",
|
||||
info->ae_id, info->qp, info->qp_cq_id, info->ae_src);
|
||||
if (rdma_protocol_roce(&iwdev->ibdev, 1)) {
|
||||
if (!info->sq && ctx_info->roce_info->err_rq_idx_valid) {
|
||||
ctx_info->roce_info->err_rq_idx_valid = info->rq;
|
||||
if (info->rq) {
|
||||
ctx_info->roce_info->err_rq_idx = info->wqe_idx;
|
||||
irdma_sc_qp_setctx_roce(&iwqp->sc_qp, iwqp->host_ctx.va,
|
||||
ctx_info);
|
||||
@@ -388,7 +380,8 @@ static void irdma_process_aeq(struct irdma_pci_f *rf)
|
||||
irdma_cm_disconn(iwqp);
|
||||
break;
|
||||
}
|
||||
if (!info->sq && ctx_info->iwarp_info->err_rq_idx_valid) {
|
||||
ctx_info->iwarp_info->err_rq_idx_valid = info->rq;
|
||||
if (info->rq) {
|
||||
ctx_info->iwarp_info->err_rq_idx = info->wqe_idx;
|
||||
ctx_info->tcp_info_valid = false;
|
||||
ctx_info->iwarp_info_valid = true;
|
||||
@@ -1512,10 +1505,7 @@ static int irdma_hmc_setup(struct irdma_pci_f *rf)
|
||||
int status;
|
||||
u32 qpcnt;
|
||||
|
||||
if (rf->rdma_ver == IRDMA_GEN_1)
|
||||
qpcnt = rsrc_limits_table[rf->limits_sel].qplimit * 2;
|
||||
else
|
||||
qpcnt = rsrc_limits_table[rf->limits_sel].qplimit;
|
||||
qpcnt = rsrc_limits_table[rf->limits_sel].qplimit;
|
||||
|
||||
rf->sd_type = IRDMA_SD_TYPE_DIRECT;
|
||||
status = irdma_cfg_fpm_val(&rf->sc_dev, qpcnt);
|
||||
@@ -1543,7 +1533,7 @@ static void irdma_del_init_mem(struct irdma_pci_f *rf)
|
||||
rf->obj_mem.pa);
|
||||
rf->obj_mem.va = NULL;
|
||||
if (rf->rdma_ver != IRDMA_GEN_1) {
|
||||
kfree(rf->allocated_ws_nodes);
|
||||
bitmap_free(rf->allocated_ws_nodes);
|
||||
rf->allocated_ws_nodes = NULL;
|
||||
}
|
||||
kfree(rf->ceqlist);
|
||||
@@ -1972,9 +1962,8 @@ u32 irdma_initialize_hw_rsrc(struct irdma_pci_f *rf)
|
||||
u32 ret;
|
||||
|
||||
if (rf->rdma_ver != IRDMA_GEN_1) {
|
||||
rf->allocated_ws_nodes =
|
||||
kcalloc(BITS_TO_LONGS(IRDMA_MAX_WS_NODES),
|
||||
sizeof(unsigned long), GFP_KERNEL);
|
||||
rf->allocated_ws_nodes = bitmap_zalloc(IRDMA_MAX_WS_NODES,
|
||||
GFP_KERNEL);
|
||||
if (!rf->allocated_ws_nodes)
|
||||
return -ENOMEM;
|
||||
|
||||
@@ -2023,7 +2012,7 @@ u32 irdma_initialize_hw_rsrc(struct irdma_pci_f *rf)
|
||||
return 0;
|
||||
|
||||
mem_rsrc_kzalloc_fail:
|
||||
kfree(rf->allocated_ws_nodes);
|
||||
bitmap_free(rf->allocated_ws_nodes);
|
||||
rf->allocated_ws_nodes = NULL;
|
||||
|
||||
return ret;
|
||||
|
||||
@@ -85,7 +85,7 @@ extern struct auxiliary_driver i40iw_auxiliary_drv;
|
||||
#define IRDMA_NO_QSET 0xffff
|
||||
|
||||
#define IW_CFG_FPM_QP_COUNT 32768
|
||||
#define IRDMA_MAX_PAGES_PER_FMR 512
|
||||
#define IRDMA_MAX_PAGES_PER_FMR 262144
|
||||
#define IRDMA_MIN_PAGES_PER_FMR 1
|
||||
#define IRDMA_CQP_COMPL_RQ_WQE_FLUSHED 2
|
||||
#define IRDMA_CQP_COMPL_SQ_WQE_FLUSHED 3
|
||||
|
||||
@@ -652,6 +652,7 @@ static const char *const irdma_cqp_cmd_names[IRDMA_MAX_CQP_OPS] = {
|
||||
};
|
||||
|
||||
static const struct irdma_cqp_err_info irdma_noncrit_err_list[] = {
|
||||
{0xffff, 0x8002, "Invalid State"},
|
||||
{0xffff, 0x8006, "Flush No Wqe Pending"},
|
||||
{0xffff, 0x8007, "Modify QP Bad Close"},
|
||||
{0xffff, 0x8009, "LLP Closed"},
|
||||
|
||||
@@ -1776,11 +1776,11 @@ static int irdma_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata)
|
||||
spin_unlock_irqrestore(&iwcq->lock, flags);
|
||||
|
||||
irdma_cq_wq_destroy(iwdev->rf, cq);
|
||||
irdma_cq_free_rsrc(iwdev->rf, iwcq);
|
||||
|
||||
spin_lock_irqsave(&iwceq->ce_lock, flags);
|
||||
irdma_sc_cleanup_ceqes(cq, ceq);
|
||||
spin_unlock_irqrestore(&iwceq->ce_lock, flags);
|
||||
irdma_cq_free_rsrc(iwdev->rf, iwcq);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -2605,7 +2605,7 @@ static struct ib_mr *irdma_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
|
||||
palloc = &iwpbl->pble_alloc;
|
||||
iwmr->page_cnt = max_num_sg;
|
||||
err_code = irdma_get_pble(iwdev->rf->pble_rsrc, palloc, iwmr->page_cnt,
|
||||
true);
|
||||
false);
|
||||
if (err_code)
|
||||
goto err_get_pble;
|
||||
|
||||
@@ -2641,8 +2641,16 @@ static int irdma_set_page(struct ib_mr *ibmr, u64 addr)
|
||||
if (unlikely(iwmr->npages == iwmr->page_cnt))
|
||||
return -ENOMEM;
|
||||
|
||||
pbl = palloc->level1.addr;
|
||||
pbl[iwmr->npages++] = addr;
|
||||
if (palloc->level == PBLE_LEVEL_2) {
|
||||
struct irdma_pble_info *palloc_info =
|
||||
palloc->level2.leaf + (iwmr->npages >> PBLE_512_SHIFT);
|
||||
|
||||
palloc_info->addr[iwmr->npages & (PBLE_PER_PAGE - 1)] = addr;
|
||||
} else {
|
||||
pbl = palloc->level1.addr;
|
||||
pbl[iwmr->npages] = addr;
|
||||
}
|
||||
iwmr->npages++;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -523,6 +523,10 @@ repoll:
|
||||
"Requestor" : "Responder", cq->mcq.cqn);
|
||||
mlx5_ib_dbg(dev, "syndrome 0x%x, vendor syndrome 0x%x\n",
|
||||
err_cqe->syndrome, err_cqe->vendor_err_synd);
|
||||
if (wc->status != IB_WC_WR_FLUSH_ERR &&
|
||||
(*cur_qp)->type == MLX5_IB_QPT_REG_UMR)
|
||||
dev->umrc.state = MLX5_UMR_STATE_RECOVER;
|
||||
|
||||
if (opcode == MLX5_CQE_REQ_ERR) {
|
||||
wq = &(*cur_qp)->sq;
|
||||
wqe_ctr = be16_to_cpu(cqe64->wqe_counter);
|
||||
|
||||
@@ -679,7 +679,15 @@ enum flow_table_type {
|
||||
#define MLX5_FS_MAX_TYPES 6
|
||||
#define MLX5_FS_MAX_ENTRIES BIT(16)
|
||||
|
||||
static struct mlx5_ib_flow_prio *_get_prio(struct mlx5_flow_namespace *ns,
|
||||
static bool mlx5_ib_shared_ft_allowed(struct ib_device *device)
|
||||
{
|
||||
struct mlx5_ib_dev *dev = to_mdev(device);
|
||||
|
||||
return MLX5_CAP_GEN(dev->mdev, shared_object_to_user_object_allowed);
|
||||
}
|
||||
|
||||
static struct mlx5_ib_flow_prio *_get_prio(struct mlx5_ib_dev *dev,
|
||||
struct mlx5_flow_namespace *ns,
|
||||
struct mlx5_ib_flow_prio *prio,
|
||||
int priority,
|
||||
int num_entries, int num_groups,
|
||||
@@ -688,6 +696,8 @@ static struct mlx5_ib_flow_prio *_get_prio(struct mlx5_flow_namespace *ns,
|
||||
struct mlx5_flow_table_attr ft_attr = {};
|
||||
struct mlx5_flow_table *ft;
|
||||
|
||||
if (mlx5_ib_shared_ft_allowed(&dev->ib_dev))
|
||||
ft_attr.uid = MLX5_SHARED_RESOURCE_UID;
|
||||
ft_attr.prio = priority;
|
||||
ft_attr.max_fte = num_entries;
|
||||
ft_attr.flags = flags;
|
||||
@@ -784,8 +794,8 @@ static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev,
|
||||
|
||||
ft = prio->flow_table;
|
||||
if (!ft)
|
||||
return _get_prio(ns, prio, priority, max_table_size, num_groups,
|
||||
flags);
|
||||
return _get_prio(dev, ns, prio, priority, max_table_size,
|
||||
num_groups, flags);
|
||||
|
||||
return prio;
|
||||
}
|
||||
@@ -927,7 +937,7 @@ int mlx5_ib_fs_add_op_fc(struct mlx5_ib_dev *dev, u32 port_num,
|
||||
|
||||
prio = &dev->flow_db->opfcs[type];
|
||||
if (!prio->flow_table) {
|
||||
prio = _get_prio(ns, prio, priority,
|
||||
prio = _get_prio(dev, ns, prio, priority,
|
||||
dev->num_ports * MAX_OPFC_RULES, 1, 0);
|
||||
if (IS_ERR(prio)) {
|
||||
err = PTR_ERR(prio);
|
||||
@@ -1407,8 +1417,8 @@ free_ucmd:
|
||||
}
|
||||
|
||||
static struct mlx5_ib_flow_prio *
|
||||
_get_flow_table(struct mlx5_ib_dev *dev,
|
||||
struct mlx5_ib_flow_matcher *fs_matcher,
|
||||
_get_flow_table(struct mlx5_ib_dev *dev, u16 user_priority,
|
||||
enum mlx5_flow_namespace_type ns_type,
|
||||
bool mcast)
|
||||
{
|
||||
struct mlx5_flow_namespace *ns = NULL;
|
||||
@@ -1421,11 +1431,11 @@ _get_flow_table(struct mlx5_ib_dev *dev,
|
||||
if (mcast)
|
||||
priority = MLX5_IB_FLOW_MCAST_PRIO;
|
||||
else
|
||||
priority = ib_prio_to_core_prio(fs_matcher->priority, false);
|
||||
priority = ib_prio_to_core_prio(user_priority, false);
|
||||
|
||||
esw_encap = mlx5_eswitch_get_encap_mode(dev->mdev) !=
|
||||
DEVLINK_ESWITCH_ENCAP_MODE_NONE;
|
||||
switch (fs_matcher->ns_type) {
|
||||
switch (ns_type) {
|
||||
case MLX5_FLOW_NAMESPACE_BYPASS:
|
||||
max_table_size = BIT(
|
||||
MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, log_max_ft_size));
|
||||
@@ -1452,17 +1462,17 @@ _get_flow_table(struct mlx5_ib_dev *dev,
|
||||
reformat_l3_tunnel_to_l2) &&
|
||||
esw_encap)
|
||||
flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT;
|
||||
priority = fs_matcher->priority;
|
||||
priority = user_priority;
|
||||
break;
|
||||
case MLX5_FLOW_NAMESPACE_RDMA_RX:
|
||||
max_table_size = BIT(
|
||||
MLX5_CAP_FLOWTABLE_RDMA_RX(dev->mdev, log_max_ft_size));
|
||||
priority = fs_matcher->priority;
|
||||
priority = user_priority;
|
||||
break;
|
||||
case MLX5_FLOW_NAMESPACE_RDMA_TX:
|
||||
max_table_size = BIT(
|
||||
MLX5_CAP_FLOWTABLE_RDMA_TX(dev->mdev, log_max_ft_size));
|
||||
priority = fs_matcher->priority;
|
||||
priority = user_priority;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
@@ -1470,11 +1480,11 @@ _get_flow_table(struct mlx5_ib_dev *dev,
|
||||
|
||||
max_table_size = min_t(int, max_table_size, MLX5_FS_MAX_ENTRIES);
|
||||
|
||||
ns = mlx5_get_flow_namespace(dev->mdev, fs_matcher->ns_type);
|
||||
ns = mlx5_get_flow_namespace(dev->mdev, ns_type);
|
||||
if (!ns)
|
||||
return ERR_PTR(-EOPNOTSUPP);
|
||||
|
||||
switch (fs_matcher->ns_type) {
|
||||
switch (ns_type) {
|
||||
case MLX5_FLOW_NAMESPACE_BYPASS:
|
||||
prio = &dev->flow_db->prios[priority];
|
||||
break;
|
||||
@@ -1499,7 +1509,7 @@ _get_flow_table(struct mlx5_ib_dev *dev,
|
||||
if (prio->flow_table)
|
||||
return prio;
|
||||
|
||||
return _get_prio(ns, prio, priority, max_table_size,
|
||||
return _get_prio(dev, ns, prio, priority, max_table_size,
|
||||
MLX5_FS_MAX_TYPES, flags);
|
||||
}
|
||||
|
||||
@@ -1618,7 +1628,8 @@ static struct mlx5_ib_flow_handler *raw_fs_rule_add(
|
||||
mcast = raw_fs_is_multicast(fs_matcher, cmd_in);
|
||||
mutex_lock(&dev->flow_db->lock);
|
||||
|
||||
ft_prio = _get_flow_table(dev, fs_matcher, mcast);
|
||||
ft_prio = _get_flow_table(dev, fs_matcher->priority,
|
||||
fs_matcher->ns_type, mcast);
|
||||
if (IS_ERR(ft_prio)) {
|
||||
err = PTR_ERR(ft_prio);
|
||||
goto unlock;
|
||||
@@ -2015,6 +2026,23 @@ static int flow_matcher_cleanup(struct ib_uobject *uobject,
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int steering_anchor_cleanup(struct ib_uobject *uobject,
|
||||
enum rdma_remove_reason why,
|
||||
struct uverbs_attr_bundle *attrs)
|
||||
{
|
||||
struct mlx5_ib_steering_anchor *obj = uobject->object;
|
||||
|
||||
if (atomic_read(&obj->usecnt))
|
||||
return -EBUSY;
|
||||
|
||||
mutex_lock(&obj->dev->flow_db->lock);
|
||||
put_flow_table(obj->dev, obj->ft_prio, true);
|
||||
mutex_unlock(&obj->dev->flow_db->lock);
|
||||
|
||||
kfree(obj);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int mlx5_ib_matcher_ns(struct uverbs_attr_bundle *attrs,
|
||||
struct mlx5_ib_flow_matcher *obj)
|
||||
{
|
||||
@@ -2050,12 +2078,10 @@ static int mlx5_ib_matcher_ns(struct uverbs_attr_bundle *attrs,
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
if (flags) {
|
||||
mlx5_ib_ft_type_to_namespace(
|
||||
if (flags)
|
||||
return mlx5_ib_ft_type_to_namespace(
|
||||
MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_TX,
|
||||
&obj->ns_type);
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
obj->ns_type = MLX5_FLOW_NAMESPACE_BYPASS;
|
||||
@@ -2121,6 +2147,75 @@ end:
|
||||
return err;
|
||||
}
|
||||
|
||||
static int UVERBS_HANDLER(MLX5_IB_METHOD_STEERING_ANCHOR_CREATE)(
|
||||
struct uverbs_attr_bundle *attrs)
|
||||
{
|
||||
struct ib_uobject *uobj = uverbs_attr_get_uobject(
|
||||
attrs, MLX5_IB_ATTR_STEERING_ANCHOR_CREATE_HANDLE);
|
||||
struct mlx5_ib_dev *dev = mlx5_udata_to_mdev(&attrs->driver_udata);
|
||||
enum mlx5_ib_uapi_flow_table_type ib_uapi_ft_type;
|
||||
enum mlx5_flow_namespace_type ns_type;
|
||||
struct mlx5_ib_steering_anchor *obj;
|
||||
struct mlx5_ib_flow_prio *ft_prio;
|
||||
u16 priority;
|
||||
u32 ft_id;
|
||||
int err;
|
||||
|
||||
if (!capable(CAP_NET_RAW))
|
||||
return -EPERM;
|
||||
|
||||
err = uverbs_get_const(&ib_uapi_ft_type, attrs,
|
||||
MLX5_IB_ATTR_STEERING_ANCHOR_FT_TYPE);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
err = mlx5_ib_ft_type_to_namespace(ib_uapi_ft_type, &ns_type);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
err = uverbs_copy_from(&priority, attrs,
|
||||
MLX5_IB_ATTR_STEERING_ANCHOR_PRIORITY);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
obj = kzalloc(sizeof(*obj), GFP_KERNEL);
|
||||
if (!obj)
|
||||
return -ENOMEM;
|
||||
|
||||
mutex_lock(&dev->flow_db->lock);
|
||||
ft_prio = _get_flow_table(dev, priority, ns_type, 0);
|
||||
if (IS_ERR(ft_prio)) {
|
||||
mutex_unlock(&dev->flow_db->lock);
|
||||
err = PTR_ERR(ft_prio);
|
||||
goto free_obj;
|
||||
}
|
||||
|
||||
ft_prio->refcount++;
|
||||
ft_id = mlx5_flow_table_id(ft_prio->flow_table);
|
||||
mutex_unlock(&dev->flow_db->lock);
|
||||
|
||||
err = uverbs_copy_to(attrs, MLX5_IB_ATTR_STEERING_ANCHOR_FT_ID,
|
||||
&ft_id, sizeof(ft_id));
|
||||
if (err)
|
||||
goto put_flow_table;
|
||||
|
||||
uobj->object = obj;
|
||||
obj->dev = dev;
|
||||
obj->ft_prio = ft_prio;
|
||||
atomic_set(&obj->usecnt, 0);
|
||||
|
||||
return 0;
|
||||
|
||||
put_flow_table:
|
||||
mutex_lock(&dev->flow_db->lock);
|
||||
put_flow_table(dev, ft_prio, true);
|
||||
mutex_unlock(&dev->flow_db->lock);
|
||||
free_obj:
|
||||
kfree(obj);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
static struct ib_flow_action *
|
||||
mlx5_ib_create_modify_header(struct mlx5_ib_dev *dev,
|
||||
enum mlx5_ib_uapi_flow_table_type ft_type,
|
||||
@@ -2477,6 +2572,35 @@ DECLARE_UVERBS_NAMED_OBJECT(MLX5_IB_OBJECT_FLOW_MATCHER,
|
||||
&UVERBS_METHOD(MLX5_IB_METHOD_FLOW_MATCHER_CREATE),
|
||||
&UVERBS_METHOD(MLX5_IB_METHOD_FLOW_MATCHER_DESTROY));
|
||||
|
||||
DECLARE_UVERBS_NAMED_METHOD(
|
||||
MLX5_IB_METHOD_STEERING_ANCHOR_CREATE,
|
||||
UVERBS_ATTR_IDR(MLX5_IB_ATTR_STEERING_ANCHOR_CREATE_HANDLE,
|
||||
MLX5_IB_OBJECT_STEERING_ANCHOR,
|
||||
UVERBS_ACCESS_NEW,
|
||||
UA_MANDATORY),
|
||||
UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_STEERING_ANCHOR_FT_TYPE,
|
||||
enum mlx5_ib_uapi_flow_table_type,
|
||||
UA_MANDATORY),
|
||||
UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_STEERING_ANCHOR_PRIORITY,
|
||||
UVERBS_ATTR_TYPE(u16),
|
||||
UA_MANDATORY),
|
||||
UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_STEERING_ANCHOR_FT_ID,
|
||||
UVERBS_ATTR_TYPE(u32),
|
||||
UA_MANDATORY));
|
||||
|
||||
DECLARE_UVERBS_NAMED_METHOD_DESTROY(
|
||||
MLX5_IB_METHOD_STEERING_ANCHOR_DESTROY,
|
||||
UVERBS_ATTR_IDR(MLX5_IB_ATTR_STEERING_ANCHOR_DESTROY_HANDLE,
|
||||
MLX5_IB_OBJECT_STEERING_ANCHOR,
|
||||
UVERBS_ACCESS_DESTROY,
|
||||
UA_MANDATORY));
|
||||
|
||||
DECLARE_UVERBS_NAMED_OBJECT(
|
||||
MLX5_IB_OBJECT_STEERING_ANCHOR,
|
||||
UVERBS_TYPE_ALLOC_IDR(steering_anchor_cleanup),
|
||||
&UVERBS_METHOD(MLX5_IB_METHOD_STEERING_ANCHOR_CREATE),
|
||||
&UVERBS_METHOD(MLX5_IB_METHOD_STEERING_ANCHOR_DESTROY));
|
||||
|
||||
const struct uapi_definition mlx5_ib_flow_defs[] = {
|
||||
UAPI_DEF_CHAIN_OBJ_TREE_NAMED(
|
||||
MLX5_IB_OBJECT_FLOW_MATCHER),
|
||||
@@ -2485,6 +2609,9 @@ const struct uapi_definition mlx5_ib_flow_defs[] = {
|
||||
&mlx5_ib_fs),
|
||||
UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_FLOW_ACTION,
|
||||
&mlx5_ib_flow_actions),
|
||||
UAPI_DEF_CHAIN_OBJ_TREE_NAMED(
|
||||
MLX5_IB_OBJECT_STEERING_ANCHOR,
|
||||
UAPI_DEF_IS_OBJ_SUPPORTED(mlx5_ib_shared_ft_allowed)),
|
||||
{},
|
||||
};
|
||||
|
||||
|
||||
@@ -4002,7 +4002,7 @@ static void mlx5_ib_stage_pre_ib_reg_umr_cleanup(struct mlx5_ib_dev *dev)
|
||||
{
|
||||
int err;
|
||||
|
||||
err = mlx5_mr_cache_cleanup(dev);
|
||||
err = mlx5_mkey_cache_cleanup(dev);
|
||||
if (err)
|
||||
mlx5_ib_warn(dev, "mr cache cleanup failed\n");
|
||||
|
||||
@@ -4022,7 +4022,7 @@ static int mlx5_ib_stage_post_ib_reg_umr_init(struct mlx5_ib_dev *dev)
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
ret = mlx5_mr_cache_init(dev);
|
||||
ret = mlx5_mkey_cache_init(dev);
|
||||
if (ret) {
|
||||
mlx5_ib_warn(dev, "mr cache init failed %d\n", ret);
|
||||
mlx5r_umr_resource_cleanup(dev);
|
||||
|
||||
@@ -259,6 +259,12 @@ struct mlx5_ib_flow_matcher {
|
||||
u8 match_criteria_enable;
|
||||
};
|
||||
|
||||
struct mlx5_ib_steering_anchor {
|
||||
struct mlx5_ib_flow_prio *ft_prio;
|
||||
struct mlx5_ib_dev *dev;
|
||||
atomic_t usecnt;
|
||||
};
|
||||
|
||||
struct mlx5_ib_pp {
|
||||
u16 index;
|
||||
struct mlx5_core_dev *mdev;
|
||||
@@ -613,6 +619,7 @@ struct mlx5_ib_mkey {
|
||||
unsigned int ndescs;
|
||||
struct wait_queue_head wait;
|
||||
refcount_t usecount;
|
||||
struct mlx5_cache_ent *cache_ent;
|
||||
};
|
||||
|
||||
#define MLX5_IB_MTT_PRESENT (MLX5_IB_MTT_READ | MLX5_IB_MTT_WRITE)
|
||||
@@ -635,20 +642,9 @@ struct mlx5_ib_mr {
|
||||
struct ib_mr ibmr;
|
||||
struct mlx5_ib_mkey mmkey;
|
||||
|
||||
/* User MR data */
|
||||
struct mlx5_cache_ent *cache_ent;
|
||||
/* Everything after cache_ent is zero'd when MR allocated */
|
||||
struct ib_umem *umem;
|
||||
|
||||
union {
|
||||
/* Used only while the MR is in the cache */
|
||||
struct {
|
||||
u32 out[MLX5_ST_SZ_DW(create_mkey_out)];
|
||||
struct mlx5_async_work cb_work;
|
||||
/* Cache list element */
|
||||
struct list_head list;
|
||||
};
|
||||
|
||||
/* Used only by kernel MRs (umem == NULL) */
|
||||
struct {
|
||||
void *descs;
|
||||
@@ -688,12 +684,6 @@ struct mlx5_ib_mr {
|
||||
};
|
||||
};
|
||||
|
||||
/* Zero the fields in the mr that are variant depending on usage */
|
||||
static inline void mlx5_clear_mr(struct mlx5_ib_mr *mr)
|
||||
{
|
||||
memset_after(mr, 0, cache_ent);
|
||||
}
|
||||
|
||||
static inline bool is_odp_mr(struct mlx5_ib_mr *mr)
|
||||
{
|
||||
return IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) && mr->umem &&
|
||||
@@ -717,21 +707,29 @@ struct mlx5_ib_umr_context {
|
||||
struct completion done;
|
||||
};
|
||||
|
||||
enum {
|
||||
MLX5_UMR_STATE_ACTIVE,
|
||||
MLX5_UMR_STATE_RECOVER,
|
||||
MLX5_UMR_STATE_ERR,
|
||||
};
|
||||
|
||||
struct umr_common {
|
||||
struct ib_pd *pd;
|
||||
struct ib_cq *cq;
|
||||
struct ib_qp *qp;
|
||||
/* control access to UMR QP
|
||||
/* Protects from UMR QP overflow
|
||||
*/
|
||||
struct semaphore sem;
|
||||
/* Protects from using UMR while the UMR is not active
|
||||
*/
|
||||
struct mutex lock;
|
||||
unsigned int state;
|
||||
};
|
||||
|
||||
struct mlx5_cache_ent {
|
||||
struct list_head head;
|
||||
/* sync access to the cahce entry
|
||||
*/
|
||||
spinlock_t lock;
|
||||
|
||||
struct xarray mkeys;
|
||||
unsigned long stored;
|
||||
unsigned long reserved;
|
||||
|
||||
char name[4];
|
||||
u32 order;
|
||||
@@ -743,18 +741,11 @@ struct mlx5_cache_ent {
|
||||
u8 fill_to_high_water:1;
|
||||
|
||||
/*
|
||||
* - available_mrs is the length of list head, ie the number of MRs
|
||||
* available for immediate allocation.
|
||||
* - total_mrs is available_mrs plus all in use MRs that could be
|
||||
* returned to the cache.
|
||||
* - limit is the low water mark for available_mrs, 2* limit is the
|
||||
* - limit is the low water mark for stored mkeys, 2* limit is the
|
||||
* upper water mark.
|
||||
* - pending is the number of MRs currently being created
|
||||
*/
|
||||
u32 total_mrs;
|
||||
u32 available_mrs;
|
||||
u32 in_use;
|
||||
u32 limit;
|
||||
u32 pending;
|
||||
|
||||
/* Statistics */
|
||||
u32 miss;
|
||||
@@ -763,9 +754,19 @@ struct mlx5_cache_ent {
|
||||
struct delayed_work dwork;
|
||||
};
|
||||
|
||||
struct mlx5_mr_cache {
|
||||
struct mlx5r_async_create_mkey {
|
||||
union {
|
||||
u32 in[MLX5_ST_SZ_BYTES(create_mkey_in)];
|
||||
u32 out[MLX5_ST_SZ_DW(create_mkey_out)];
|
||||
};
|
||||
struct mlx5_async_work cb_work;
|
||||
struct mlx5_cache_ent *ent;
|
||||
u32 mkey;
|
||||
};
|
||||
|
||||
struct mlx5_mkey_cache {
|
||||
struct workqueue_struct *wq;
|
||||
struct mlx5_cache_ent ent[MAX_MR_CACHE_ENTRIES];
|
||||
struct mlx5_cache_ent ent[MAX_MKEY_CACHE_ENTRIES];
|
||||
struct dentry *root;
|
||||
unsigned long last_add;
|
||||
};
|
||||
@@ -1064,7 +1065,7 @@ struct mlx5_ib_dev {
|
||||
struct mlx5_ib_resources devr;
|
||||
|
||||
atomic_t mkey_var;
|
||||
struct mlx5_mr_cache cache;
|
||||
struct mlx5_mkey_cache cache;
|
||||
struct timer_list delay_timer;
|
||||
/* Prevents soft lock on massive reg MRs */
|
||||
struct mutex slow_path_mutex;
|
||||
@@ -1309,8 +1310,8 @@ void mlx5_ib_populate_pas(struct ib_umem *umem, size_t page_size, __be64 *pas,
|
||||
u64 access_flags);
|
||||
void mlx5_ib_copy_pas(u64 *old, u64 *new, int step, int num);
|
||||
int mlx5_ib_get_cqe_size(struct ib_cq *ibcq);
|
||||
int mlx5_mr_cache_init(struct mlx5_ib_dev *dev);
|
||||
int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev);
|
||||
int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev);
|
||||
int mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev);
|
||||
|
||||
struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
|
||||
struct mlx5_cache_ent *ent,
|
||||
@@ -1338,7 +1339,7 @@ int mlx5r_odp_create_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq);
|
||||
void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *ibdev);
|
||||
int __init mlx5_ib_odp_init(void);
|
||||
void mlx5_ib_odp_cleanup(void);
|
||||
void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent);
|
||||
void mlx5_odp_init_mkey_cache_entry(struct mlx5_cache_ent *ent);
|
||||
void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
|
||||
struct mlx5_ib_mr *mr, int flags);
|
||||
|
||||
@@ -1357,7 +1358,7 @@ static inline int mlx5r_odp_create_eq(struct mlx5_ib_dev *dev,
|
||||
static inline void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *ibdev) {}
|
||||
static inline int mlx5_ib_odp_init(void) { return 0; }
|
||||
static inline void mlx5_ib_odp_cleanup(void) {}
|
||||
static inline void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent) {}
|
||||
static inline void mlx5_odp_init_mkey_cache_entry(struct mlx5_cache_ent *ent) {}
|
||||
static inline void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
|
||||
struct mlx5_ib_mr *mr, int flags) {}
|
||||
|
||||
|
||||
@@ -82,15 +82,14 @@ static void set_mkc_access_pd_addr_fields(void *mkc, int acc, u64 start_addr,
|
||||
MLX5_SET64(mkc, mkc, start_addr, start_addr);
|
||||
}
|
||||
|
||||
static void assign_mkey_variant(struct mlx5_ib_dev *dev,
|
||||
struct mlx5_ib_mkey *mkey, u32 *in)
|
||||
static void assign_mkey_variant(struct mlx5_ib_dev *dev, u32 *mkey, u32 *in)
|
||||
{
|
||||
u8 key = atomic_inc_return(&dev->mkey_var);
|
||||
void *mkc;
|
||||
|
||||
mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
|
||||
MLX5_SET(mkc, mkc, mkey_7_0, key);
|
||||
mkey->key = key;
|
||||
*mkey = key;
|
||||
}
|
||||
|
||||
static int mlx5_ib_create_mkey(struct mlx5_ib_dev *dev,
|
||||
@@ -98,7 +97,7 @@ static int mlx5_ib_create_mkey(struct mlx5_ib_dev *dev,
|
||||
{
|
||||
int ret;
|
||||
|
||||
assign_mkey_variant(dev, mkey, in);
|
||||
assign_mkey_variant(dev, &mkey->key, in);
|
||||
ret = mlx5_core_create_mkey(dev->mdev, &mkey->key, in, inlen);
|
||||
if (!ret)
|
||||
init_waitqueue_head(&mkey->wait);
|
||||
@@ -106,20 +105,21 @@ static int mlx5_ib_create_mkey(struct mlx5_ib_dev *dev,
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int
|
||||
mlx5_ib_create_mkey_cb(struct mlx5_ib_dev *dev,
|
||||
struct mlx5_ib_mkey *mkey,
|
||||
struct mlx5_async_ctx *async_ctx,
|
||||
u32 *in, int inlen, u32 *out, int outlen,
|
||||
struct mlx5_async_work *context)
|
||||
static int mlx5_ib_create_mkey_cb(struct mlx5r_async_create_mkey *async_create)
|
||||
{
|
||||
MLX5_SET(create_mkey_in, in, opcode, MLX5_CMD_OP_CREATE_MKEY);
|
||||
assign_mkey_variant(dev, mkey, in);
|
||||
return mlx5_cmd_exec_cb(async_ctx, in, inlen, out, outlen,
|
||||
create_mkey_callback, context);
|
||||
struct mlx5_ib_dev *dev = async_create->ent->dev;
|
||||
size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
|
||||
size_t outlen = MLX5_ST_SZ_BYTES(create_mkey_out);
|
||||
|
||||
MLX5_SET(create_mkey_in, async_create->in, opcode,
|
||||
MLX5_CMD_OP_CREATE_MKEY);
|
||||
assign_mkey_variant(dev, &async_create->mkey, async_create->in);
|
||||
return mlx5_cmd_exec_cb(&dev->async_ctx, async_create->in, inlen,
|
||||
async_create->out, outlen, create_mkey_callback,
|
||||
&async_create->cb_work);
|
||||
}
|
||||
|
||||
static int mr_cache_max_order(struct mlx5_ib_dev *dev);
|
||||
static int mkey_cache_max_order(struct mlx5_ib_dev *dev);
|
||||
static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent);
|
||||
|
||||
static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
|
||||
@@ -142,40 +142,132 @@ static void create_mkey_warn(struct mlx5_ib_dev *dev, int status, void *out)
|
||||
mlx5_cmd_out_err(dev->mdev, MLX5_CMD_OP_CREATE_MKEY, 0, out);
|
||||
}
|
||||
|
||||
|
||||
static int push_mkey(struct mlx5_cache_ent *ent, bool limit_pendings,
|
||||
void *to_store)
|
||||
{
|
||||
XA_STATE(xas, &ent->mkeys, 0);
|
||||
void *curr;
|
||||
|
||||
xa_lock_irq(&ent->mkeys);
|
||||
if (limit_pendings &&
|
||||
(ent->reserved - ent->stored) > MAX_PENDING_REG_MR) {
|
||||
xa_unlock_irq(&ent->mkeys);
|
||||
return -EAGAIN;
|
||||
}
|
||||
while (1) {
|
||||
/*
|
||||
* This is cmpxchg (NULL, XA_ZERO_ENTRY) however this version
|
||||
* doesn't transparently unlock. Instead we set the xas index to
|
||||
* the current value of reserved every iteration.
|
||||
*/
|
||||
xas_set(&xas, ent->reserved);
|
||||
curr = xas_load(&xas);
|
||||
if (!curr) {
|
||||
if (to_store && ent->stored == ent->reserved)
|
||||
xas_store(&xas, to_store);
|
||||
else
|
||||
xas_store(&xas, XA_ZERO_ENTRY);
|
||||
if (xas_valid(&xas)) {
|
||||
ent->reserved++;
|
||||
if (to_store) {
|
||||
if (ent->stored != ent->reserved)
|
||||
__xa_store(&ent->mkeys,
|
||||
ent->stored,
|
||||
to_store,
|
||||
GFP_KERNEL);
|
||||
ent->stored++;
|
||||
queue_adjust_cache_locked(ent);
|
||||
WRITE_ONCE(ent->dev->cache.last_add,
|
||||
jiffies);
|
||||
}
|
||||
}
|
||||
}
|
||||
xa_unlock_irq(&ent->mkeys);
|
||||
|
||||
/*
|
||||
* Notice xas_nomem() must always be called as it cleans
|
||||
* up any cached allocation.
|
||||
*/
|
||||
if (!xas_nomem(&xas, GFP_KERNEL))
|
||||
break;
|
||||
xa_lock_irq(&ent->mkeys);
|
||||
}
|
||||
if (xas_error(&xas))
|
||||
return xas_error(&xas);
|
||||
if (WARN_ON(curr))
|
||||
return -EINVAL;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void undo_push_reserve_mkey(struct mlx5_cache_ent *ent)
|
||||
{
|
||||
void *old;
|
||||
|
||||
ent->reserved--;
|
||||
old = __xa_erase(&ent->mkeys, ent->reserved);
|
||||
WARN_ON(old);
|
||||
}
|
||||
|
||||
static void push_to_reserved(struct mlx5_cache_ent *ent, u32 mkey)
|
||||
{
|
||||
void *old;
|
||||
|
||||
old = __xa_store(&ent->mkeys, ent->stored, xa_mk_value(mkey), 0);
|
||||
WARN_ON(old);
|
||||
ent->stored++;
|
||||
}
|
||||
|
||||
static u32 pop_stored_mkey(struct mlx5_cache_ent *ent)
|
||||
{
|
||||
void *old, *xa_mkey;
|
||||
|
||||
ent->stored--;
|
||||
ent->reserved--;
|
||||
|
||||
if (ent->stored == ent->reserved) {
|
||||
xa_mkey = __xa_erase(&ent->mkeys, ent->stored);
|
||||
WARN_ON(!xa_mkey);
|
||||
return (u32)xa_to_value(xa_mkey);
|
||||
}
|
||||
|
||||
xa_mkey = __xa_store(&ent->mkeys, ent->stored, XA_ZERO_ENTRY,
|
||||
GFP_KERNEL);
|
||||
WARN_ON(!xa_mkey || xa_is_err(xa_mkey));
|
||||
old = __xa_erase(&ent->mkeys, ent->reserved);
|
||||
WARN_ON(old);
|
||||
return (u32)xa_to_value(xa_mkey);
|
||||
}
|
||||
|
||||
static void create_mkey_callback(int status, struct mlx5_async_work *context)
|
||||
{
|
||||
struct mlx5_ib_mr *mr =
|
||||
container_of(context, struct mlx5_ib_mr, cb_work);
|
||||
struct mlx5_cache_ent *ent = mr->cache_ent;
|
||||
struct mlx5r_async_create_mkey *mkey_out =
|
||||
container_of(context, struct mlx5r_async_create_mkey, cb_work);
|
||||
struct mlx5_cache_ent *ent = mkey_out->ent;
|
||||
struct mlx5_ib_dev *dev = ent->dev;
|
||||
unsigned long flags;
|
||||
|
||||
if (status) {
|
||||
create_mkey_warn(dev, status, mr->out);
|
||||
kfree(mr);
|
||||
spin_lock_irqsave(&ent->lock, flags);
|
||||
ent->pending--;
|
||||
create_mkey_warn(dev, status, mkey_out->out);
|
||||
kfree(mkey_out);
|
||||
xa_lock_irqsave(&ent->mkeys, flags);
|
||||
undo_push_reserve_mkey(ent);
|
||||
WRITE_ONCE(dev->fill_delay, 1);
|
||||
spin_unlock_irqrestore(&ent->lock, flags);
|
||||
xa_unlock_irqrestore(&ent->mkeys, flags);
|
||||
mod_timer(&dev->delay_timer, jiffies + HZ);
|
||||
return;
|
||||
}
|
||||
|
||||
mr->mmkey.type = MLX5_MKEY_MR;
|
||||
mr->mmkey.key |= mlx5_idx_to_mkey(
|
||||
MLX5_GET(create_mkey_out, mr->out, mkey_index));
|
||||
init_waitqueue_head(&mr->mmkey.wait);
|
||||
|
||||
mkey_out->mkey |= mlx5_idx_to_mkey(
|
||||
MLX5_GET(create_mkey_out, mkey_out->out, mkey_index));
|
||||
WRITE_ONCE(dev->cache.last_add, jiffies);
|
||||
|
||||
spin_lock_irqsave(&ent->lock, flags);
|
||||
list_add_tail(&mr->list, &ent->head);
|
||||
ent->available_mrs++;
|
||||
ent->total_mrs++;
|
||||
xa_lock_irqsave(&ent->mkeys, flags);
|
||||
push_to_reserved(ent, mkey_out->mkey);
|
||||
/* If we are doing fill_to_high_water then keep going. */
|
||||
queue_adjust_cache_locked(ent);
|
||||
ent->pending--;
|
||||
spin_unlock_irqrestore(&ent->lock, flags);
|
||||
xa_unlock_irqrestore(&ent->mkeys, flags);
|
||||
kfree(mkey_out);
|
||||
}
|
||||
|
||||
static int get_mkc_octo_size(unsigned int access_mode, unsigned int ndescs)
|
||||
@@ -197,15 +289,8 @@ static int get_mkc_octo_size(unsigned int access_mode, unsigned int ndescs)
|
||||
return ret;
|
||||
}
|
||||
|
||||
static struct mlx5_ib_mr *alloc_cache_mr(struct mlx5_cache_ent *ent, void *mkc)
|
||||
static void set_cache_mkc(struct mlx5_cache_ent *ent, void *mkc)
|
||||
{
|
||||
struct mlx5_ib_mr *mr;
|
||||
|
||||
mr = kzalloc(sizeof(*mr), GFP_KERNEL);
|
||||
if (!mr)
|
||||
return NULL;
|
||||
mr->cache_ent = ent;
|
||||
|
||||
set_mkc_access_pd_addr_fields(mkc, 0, 0, ent->dev->umrc.pd);
|
||||
MLX5_SET(mkc, mkc, free, 1);
|
||||
MLX5_SET(mkc, mkc, umr_en, 1);
|
||||
@@ -215,133 +300,106 @@ static struct mlx5_ib_mr *alloc_cache_mr(struct mlx5_cache_ent *ent, void *mkc)
|
||||
MLX5_SET(mkc, mkc, translations_octword_size,
|
||||
get_mkc_octo_size(ent->access_mode, ent->ndescs));
|
||||
MLX5_SET(mkc, mkc, log_page_size, ent->page);
|
||||
return mr;
|
||||
}
|
||||
|
||||
/* Asynchronously schedule new MRs to be populated in the cache. */
|
||||
static int add_keys(struct mlx5_cache_ent *ent, unsigned int num)
|
||||
{
|
||||
size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
|
||||
struct mlx5_ib_mr *mr;
|
||||
struct mlx5r_async_create_mkey *async_create;
|
||||
void *mkc;
|
||||
u32 *in;
|
||||
int err = 0;
|
||||
int i;
|
||||
|
||||
in = kzalloc(inlen, GFP_KERNEL);
|
||||
if (!in)
|
||||
return -ENOMEM;
|
||||
|
||||
mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
|
||||
for (i = 0; i < num; i++) {
|
||||
mr = alloc_cache_mr(ent, mkc);
|
||||
if (!mr) {
|
||||
err = -ENOMEM;
|
||||
break;
|
||||
}
|
||||
spin_lock_irq(&ent->lock);
|
||||
if (ent->pending >= MAX_PENDING_REG_MR) {
|
||||
err = -EAGAIN;
|
||||
spin_unlock_irq(&ent->lock);
|
||||
kfree(mr);
|
||||
break;
|
||||
}
|
||||
ent->pending++;
|
||||
spin_unlock_irq(&ent->lock);
|
||||
err = mlx5_ib_create_mkey_cb(ent->dev, &mr->mmkey,
|
||||
&ent->dev->async_ctx, in, inlen,
|
||||
mr->out, sizeof(mr->out),
|
||||
&mr->cb_work);
|
||||
async_create = kzalloc(sizeof(struct mlx5r_async_create_mkey),
|
||||
GFP_KERNEL);
|
||||
if (!async_create)
|
||||
return -ENOMEM;
|
||||
mkc = MLX5_ADDR_OF(create_mkey_in, async_create->in,
|
||||
memory_key_mkey_entry);
|
||||
set_cache_mkc(ent, mkc);
|
||||
async_create->ent = ent;
|
||||
|
||||
err = push_mkey(ent, true, NULL);
|
||||
if (err)
|
||||
goto free_async_create;
|
||||
|
||||
err = mlx5_ib_create_mkey_cb(async_create);
|
||||
if (err) {
|
||||
spin_lock_irq(&ent->lock);
|
||||
ent->pending--;
|
||||
spin_unlock_irq(&ent->lock);
|
||||
mlx5_ib_warn(ent->dev, "create mkey failed %d\n", err);
|
||||
kfree(mr);
|
||||
break;
|
||||
goto err_undo_reserve;
|
||||
}
|
||||
}
|
||||
|
||||
kfree(in);
|
||||
return 0;
|
||||
|
||||
err_undo_reserve:
|
||||
xa_lock_irq(&ent->mkeys);
|
||||
undo_push_reserve_mkey(ent);
|
||||
xa_unlock_irq(&ent->mkeys);
|
||||
free_async_create:
|
||||
kfree(async_create);
|
||||
return err;
|
||||
}
|
||||
|
||||
/* Synchronously create a MR in the cache */
|
||||
static struct mlx5_ib_mr *create_cache_mr(struct mlx5_cache_ent *ent)
|
||||
static int create_cache_mkey(struct mlx5_cache_ent *ent, u32 *mkey)
|
||||
{
|
||||
size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
|
||||
struct mlx5_ib_mr *mr;
|
||||
void *mkc;
|
||||
u32 *in;
|
||||
int err;
|
||||
|
||||
in = kzalloc(inlen, GFP_KERNEL);
|
||||
if (!in)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
return -ENOMEM;
|
||||
mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
|
||||
set_cache_mkc(ent, mkc);
|
||||
|
||||
mr = alloc_cache_mr(ent, mkc);
|
||||
if (!mr) {
|
||||
err = -ENOMEM;
|
||||
goto free_in;
|
||||
}
|
||||
|
||||
err = mlx5_core_create_mkey(ent->dev->mdev, &mr->mmkey.key, in, inlen);
|
||||
err = mlx5_core_create_mkey(ent->dev->mdev, mkey, in, inlen);
|
||||
if (err)
|
||||
goto free_mr;
|
||||
goto free_in;
|
||||
|
||||
init_waitqueue_head(&mr->mmkey.wait);
|
||||
mr->mmkey.type = MLX5_MKEY_MR;
|
||||
WRITE_ONCE(ent->dev->cache.last_add, jiffies);
|
||||
spin_lock_irq(&ent->lock);
|
||||
ent->total_mrs++;
|
||||
spin_unlock_irq(&ent->lock);
|
||||
kfree(in);
|
||||
return mr;
|
||||
free_mr:
|
||||
kfree(mr);
|
||||
free_in:
|
||||
kfree(in);
|
||||
return ERR_PTR(err);
|
||||
return err;
|
||||
}
|
||||
|
||||
static void remove_cache_mr_locked(struct mlx5_cache_ent *ent)
|
||||
{
|
||||
struct mlx5_ib_mr *mr;
|
||||
u32 mkey;
|
||||
|
||||
lockdep_assert_held(&ent->lock);
|
||||
if (list_empty(&ent->head))
|
||||
lockdep_assert_held(&ent->mkeys.xa_lock);
|
||||
if (!ent->stored)
|
||||
return;
|
||||
mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
|
||||
list_del(&mr->list);
|
||||
ent->available_mrs--;
|
||||
ent->total_mrs--;
|
||||
spin_unlock_irq(&ent->lock);
|
||||
mlx5_core_destroy_mkey(ent->dev->mdev, mr->mmkey.key);
|
||||
kfree(mr);
|
||||
spin_lock_irq(&ent->lock);
|
||||
mkey = pop_stored_mkey(ent);
|
||||
xa_unlock_irq(&ent->mkeys);
|
||||
mlx5_core_destroy_mkey(ent->dev->mdev, mkey);
|
||||
xa_lock_irq(&ent->mkeys);
|
||||
}
|
||||
|
||||
static int resize_available_mrs(struct mlx5_cache_ent *ent, unsigned int target,
|
||||
bool limit_fill)
|
||||
__acquires(&ent->mkeys) __releases(&ent->mkeys)
|
||||
{
|
||||
int err;
|
||||
|
||||
lockdep_assert_held(&ent->lock);
|
||||
lockdep_assert_held(&ent->mkeys.xa_lock);
|
||||
|
||||
while (true) {
|
||||
if (limit_fill)
|
||||
target = ent->limit * 2;
|
||||
if (target == ent->available_mrs + ent->pending)
|
||||
if (target == ent->reserved)
|
||||
return 0;
|
||||
if (target > ent->available_mrs + ent->pending) {
|
||||
u32 todo = target - (ent->available_mrs + ent->pending);
|
||||
if (target > ent->reserved) {
|
||||
u32 todo = target - ent->reserved;
|
||||
|
||||
spin_unlock_irq(&ent->lock);
|
||||
xa_unlock_irq(&ent->mkeys);
|
||||
err = add_keys(ent, todo);
|
||||
if (err == -EAGAIN)
|
||||
usleep_range(3000, 5000);
|
||||
spin_lock_irq(&ent->lock);
|
||||
xa_lock_irq(&ent->mkeys);
|
||||
if (err) {
|
||||
if (err != -EAGAIN)
|
||||
return err;
|
||||
@@ -366,15 +424,15 @@ static ssize_t size_write(struct file *filp, const char __user *buf,
|
||||
|
||||
/*
|
||||
* Target is the new value of total_mrs the user requests, however we
|
||||
* cannot free MRs that are in use. Compute the target value for
|
||||
* available_mrs.
|
||||
* cannot free MRs that are in use. Compute the target value for stored
|
||||
* mkeys.
|
||||
*/
|
||||
spin_lock_irq(&ent->lock);
|
||||
if (target < ent->total_mrs - ent->available_mrs) {
|
||||
xa_lock_irq(&ent->mkeys);
|
||||
if (target < ent->in_use) {
|
||||
err = -EINVAL;
|
||||
goto err_unlock;
|
||||
}
|
||||
target = target - (ent->total_mrs - ent->available_mrs);
|
||||
target = target - ent->in_use;
|
||||
if (target < ent->limit || target > ent->limit*2) {
|
||||
err = -EINVAL;
|
||||
goto err_unlock;
|
||||
@@ -382,12 +440,12 @@ static ssize_t size_write(struct file *filp, const char __user *buf,
|
||||
err = resize_available_mrs(ent, target, false);
|
||||
if (err)
|
||||
goto err_unlock;
|
||||
spin_unlock_irq(&ent->lock);
|
||||
xa_unlock_irq(&ent->mkeys);
|
||||
|
||||
return count;
|
||||
|
||||
err_unlock:
|
||||
spin_unlock_irq(&ent->lock);
|
||||
xa_unlock_irq(&ent->mkeys);
|
||||
return err;
|
||||
}
|
||||
|
||||
@@ -398,7 +456,7 @@ static ssize_t size_read(struct file *filp, char __user *buf, size_t count,
|
||||
char lbuf[20];
|
||||
int err;
|
||||
|
||||
err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->total_mrs);
|
||||
err = snprintf(lbuf, sizeof(lbuf), "%ld\n", ent->stored + ent->in_use);
|
||||
if (err < 0)
|
||||
return err;
|
||||
|
||||
@@ -427,10 +485,10 @@ static ssize_t limit_write(struct file *filp, const char __user *buf,
|
||||
* Upon set we immediately fill the cache to high water mark implied by
|
||||
* the limit.
|
||||
*/
|
||||
spin_lock_irq(&ent->lock);
|
||||
xa_lock_irq(&ent->mkeys);
|
||||
ent->limit = var;
|
||||
err = resize_available_mrs(ent, 0, true);
|
||||
spin_unlock_irq(&ent->lock);
|
||||
xa_unlock_irq(&ent->mkeys);
|
||||
if (err)
|
||||
return err;
|
||||
return count;
|
||||
@@ -457,17 +515,17 @@ static const struct file_operations limit_fops = {
|
||||
.read = limit_read,
|
||||
};
|
||||
|
||||
static bool someone_adding(struct mlx5_mr_cache *cache)
|
||||
static bool someone_adding(struct mlx5_mkey_cache *cache)
|
||||
{
|
||||
unsigned int i;
|
||||
|
||||
for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
|
||||
for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) {
|
||||
struct mlx5_cache_ent *ent = &cache->ent[i];
|
||||
bool ret;
|
||||
|
||||
spin_lock_irq(&ent->lock);
|
||||
ret = ent->available_mrs < ent->limit;
|
||||
spin_unlock_irq(&ent->lock);
|
||||
xa_lock_irq(&ent->mkeys);
|
||||
ret = ent->stored < ent->limit;
|
||||
xa_unlock_irq(&ent->mkeys);
|
||||
if (ret)
|
||||
return true;
|
||||
}
|
||||
@@ -481,26 +539,26 @@ static bool someone_adding(struct mlx5_mr_cache *cache)
|
||||
*/
|
||||
static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent)
|
||||
{
|
||||
lockdep_assert_held(&ent->lock);
|
||||
lockdep_assert_held(&ent->mkeys.xa_lock);
|
||||
|
||||
if (ent->disabled || READ_ONCE(ent->dev->fill_delay))
|
||||
return;
|
||||
if (ent->available_mrs < ent->limit) {
|
||||
if (ent->stored < ent->limit) {
|
||||
ent->fill_to_high_water = true;
|
||||
mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
|
||||
} else if (ent->fill_to_high_water &&
|
||||
ent->available_mrs + ent->pending < 2 * ent->limit) {
|
||||
ent->reserved < 2 * ent->limit) {
|
||||
/*
|
||||
* Once we start populating due to hitting a low water mark
|
||||
* continue until we pass the high water mark.
|
||||
*/
|
||||
mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
|
||||
} else if (ent->available_mrs == 2 * ent->limit) {
|
||||
} else if (ent->stored == 2 * ent->limit) {
|
||||
ent->fill_to_high_water = false;
|
||||
} else if (ent->available_mrs > 2 * ent->limit) {
|
||||
} else if (ent->stored > 2 * ent->limit) {
|
||||
/* Queue deletion of excess entries */
|
||||
ent->fill_to_high_water = false;
|
||||
if (ent->pending)
|
||||
if (ent->stored != ent->reserved)
|
||||
queue_delayed_work(ent->dev->cache.wq, &ent->dwork,
|
||||
msecs_to_jiffies(1000));
|
||||
else
|
||||
@@ -511,25 +569,24 @@ static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent)
|
||||
static void __cache_work_func(struct mlx5_cache_ent *ent)
|
||||
{
|
||||
struct mlx5_ib_dev *dev = ent->dev;
|
||||
struct mlx5_mr_cache *cache = &dev->cache;
|
||||
struct mlx5_mkey_cache *cache = &dev->cache;
|
||||
int err;
|
||||
|
||||
spin_lock_irq(&ent->lock);
|
||||
xa_lock_irq(&ent->mkeys);
|
||||
if (ent->disabled)
|
||||
goto out;
|
||||
|
||||
if (ent->fill_to_high_water &&
|
||||
ent->available_mrs + ent->pending < 2 * ent->limit &&
|
||||
if (ent->fill_to_high_water && ent->reserved < 2 * ent->limit &&
|
||||
!READ_ONCE(dev->fill_delay)) {
|
||||
spin_unlock_irq(&ent->lock);
|
||||
xa_unlock_irq(&ent->mkeys);
|
||||
err = add_keys(ent, 1);
|
||||
spin_lock_irq(&ent->lock);
|
||||
xa_lock_irq(&ent->mkeys);
|
||||
if (ent->disabled)
|
||||
goto out;
|
||||
if (err) {
|
||||
/*
|
||||
* EAGAIN only happens if pending is positive, so we
|
||||
* will be rescheduled from reg_mr_callback(). The only
|
||||
* EAGAIN only happens if there are pending MRs, so we
|
||||
* will be rescheduled when storing them. The only
|
||||
* failure path here is ENOMEM.
|
||||
*/
|
||||
if (err != -EAGAIN) {
|
||||
@@ -541,7 +598,7 @@ static void __cache_work_func(struct mlx5_cache_ent *ent)
|
||||
msecs_to_jiffies(1000));
|
||||
}
|
||||
}
|
||||
} else if (ent->available_mrs > 2 * ent->limit) {
|
||||
} else if (ent->stored > 2 * ent->limit) {
|
||||
bool need_delay;
|
||||
|
||||
/*
|
||||
@@ -556,11 +613,11 @@ static void __cache_work_func(struct mlx5_cache_ent *ent)
|
||||
* the garbage collection work to try to run in next cycle, in
|
||||
* order to free CPU resources to other tasks.
|
||||
*/
|
||||
spin_unlock_irq(&ent->lock);
|
||||
xa_unlock_irq(&ent->mkeys);
|
||||
need_delay = need_resched() || someone_adding(cache) ||
|
||||
!time_after(jiffies,
|
||||
READ_ONCE(cache->last_add) + 300 * HZ);
|
||||
spin_lock_irq(&ent->lock);
|
||||
xa_lock_irq(&ent->mkeys);
|
||||
if (ent->disabled)
|
||||
goto out;
|
||||
if (need_delay) {
|
||||
@@ -571,7 +628,7 @@ static void __cache_work_func(struct mlx5_cache_ent *ent)
|
||||
queue_adjust_cache_locked(ent);
|
||||
}
|
||||
out:
|
||||
spin_unlock_irq(&ent->lock);
|
||||
xa_unlock_irq(&ent->mkeys);
|
||||
}
|
||||
|
||||
static void delayed_cache_work_func(struct work_struct *work)
|
||||
@@ -587,73 +644,59 @@ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
|
||||
int access_flags)
|
||||
{
|
||||
struct mlx5_ib_mr *mr;
|
||||
int err;
|
||||
|
||||
/* Matches access in alloc_cache_mr() */
|
||||
if (!mlx5r_umr_can_reconfig(dev, 0, access_flags))
|
||||
return ERR_PTR(-EOPNOTSUPP);
|
||||
|
||||
spin_lock_irq(&ent->lock);
|
||||
if (list_empty(&ent->head)) {
|
||||
mr = kzalloc(sizeof(*mr), GFP_KERNEL);
|
||||
if (!mr)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
xa_lock_irq(&ent->mkeys);
|
||||
ent->in_use++;
|
||||
|
||||
if (!ent->stored) {
|
||||
queue_adjust_cache_locked(ent);
|
||||
ent->miss++;
|
||||
spin_unlock_irq(&ent->lock);
|
||||
mr = create_cache_mr(ent);
|
||||
if (IS_ERR(mr))
|
||||
return mr;
|
||||
xa_unlock_irq(&ent->mkeys);
|
||||
err = create_cache_mkey(ent, &mr->mmkey.key);
|
||||
if (err) {
|
||||
xa_lock_irq(&ent->mkeys);
|
||||
ent->in_use--;
|
||||
xa_unlock_irq(&ent->mkeys);
|
||||
kfree(mr);
|
||||
return ERR_PTR(err);
|
||||
}
|
||||
} else {
|
||||
mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
|
||||
list_del(&mr->list);
|
||||
ent->available_mrs--;
|
||||
mr->mmkey.key = pop_stored_mkey(ent);
|
||||
queue_adjust_cache_locked(ent);
|
||||
spin_unlock_irq(&ent->lock);
|
||||
|
||||
mlx5_clear_mr(mr);
|
||||
xa_unlock_irq(&ent->mkeys);
|
||||
}
|
||||
mr->mmkey.cache_ent = ent;
|
||||
mr->mmkey.type = MLX5_MKEY_MR;
|
||||
init_waitqueue_head(&mr->mmkey.wait);
|
||||
return mr;
|
||||
}
|
||||
|
||||
static void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
|
||||
{
|
||||
struct mlx5_cache_ent *ent = mr->cache_ent;
|
||||
|
||||
WRITE_ONCE(dev->cache.last_add, jiffies);
|
||||
spin_lock_irq(&ent->lock);
|
||||
list_add_tail(&mr->list, &ent->head);
|
||||
ent->available_mrs++;
|
||||
queue_adjust_cache_locked(ent);
|
||||
spin_unlock_irq(&ent->lock);
|
||||
}
|
||||
|
||||
static void clean_keys(struct mlx5_ib_dev *dev, int c)
|
||||
{
|
||||
struct mlx5_mr_cache *cache = &dev->cache;
|
||||
struct mlx5_mkey_cache *cache = &dev->cache;
|
||||
struct mlx5_cache_ent *ent = &cache->ent[c];
|
||||
struct mlx5_ib_mr *tmp_mr;
|
||||
struct mlx5_ib_mr *mr;
|
||||
LIST_HEAD(del_list);
|
||||
u32 mkey;
|
||||
|
||||
cancel_delayed_work(&ent->dwork);
|
||||
while (1) {
|
||||
spin_lock_irq(&ent->lock);
|
||||
if (list_empty(&ent->head)) {
|
||||
spin_unlock_irq(&ent->lock);
|
||||
break;
|
||||
}
|
||||
mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
|
||||
list_move(&mr->list, &del_list);
|
||||
ent->available_mrs--;
|
||||
ent->total_mrs--;
|
||||
spin_unlock_irq(&ent->lock);
|
||||
mlx5_core_destroy_mkey(dev->mdev, mr->mmkey.key);
|
||||
}
|
||||
|
||||
list_for_each_entry_safe(mr, tmp_mr, &del_list, list) {
|
||||
list_del(&mr->list);
|
||||
kfree(mr);
|
||||
xa_lock_irq(&ent->mkeys);
|
||||
while (ent->stored) {
|
||||
mkey = pop_stored_mkey(ent);
|
||||
xa_unlock_irq(&ent->mkeys);
|
||||
mlx5_core_destroy_mkey(dev->mdev, mkey);
|
||||
xa_lock_irq(&ent->mkeys);
|
||||
}
|
||||
xa_unlock_irq(&ent->mkeys);
|
||||
}
|
||||
|
||||
static void mlx5_mr_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
|
||||
static void mlx5_mkey_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
|
||||
{
|
||||
if (!mlx5_debugfs_root || dev->is_rep)
|
||||
return;
|
||||
@@ -662,9 +705,9 @@ static void mlx5_mr_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
|
||||
dev->cache.root = NULL;
|
||||
}
|
||||
|
||||
static void mlx5_mr_cache_debugfs_init(struct mlx5_ib_dev *dev)
|
||||
static void mlx5_mkey_cache_debugfs_init(struct mlx5_ib_dev *dev)
|
||||
{
|
||||
struct mlx5_mr_cache *cache = &dev->cache;
|
||||
struct mlx5_mkey_cache *cache = &dev->cache;
|
||||
struct mlx5_cache_ent *ent;
|
||||
struct dentry *dir;
|
||||
int i;
|
||||
@@ -674,13 +717,13 @@ static void mlx5_mr_cache_debugfs_init(struct mlx5_ib_dev *dev)
|
||||
|
||||
cache->root = debugfs_create_dir("mr_cache", mlx5_debugfs_get_dev_root(dev->mdev));
|
||||
|
||||
for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
|
||||
for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) {
|
||||
ent = &cache->ent[i];
|
||||
sprintf(ent->name, "%d", ent->order);
|
||||
dir = debugfs_create_dir(ent->name, cache->root);
|
||||
debugfs_create_file("size", 0600, dir, ent, &size_fops);
|
||||
debugfs_create_file("limit", 0600, dir, ent, &limit_fops);
|
||||
debugfs_create_u32("cur", 0400, dir, &ent->available_mrs);
|
||||
debugfs_create_ulong("cur", 0400, dir, &ent->stored);
|
||||
debugfs_create_u32("miss", 0600, dir, &ent->miss);
|
||||
}
|
||||
}
|
||||
@@ -692,9 +735,9 @@ static void delay_time_func(struct timer_list *t)
|
||||
WRITE_ONCE(dev->fill_delay, 0);
|
||||
}
|
||||
|
||||
int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
|
||||
int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
|
||||
{
|
||||
struct mlx5_mr_cache *cache = &dev->cache;
|
||||
struct mlx5_mkey_cache *cache = &dev->cache;
|
||||
struct mlx5_cache_ent *ent;
|
||||
int i;
|
||||
|
||||
@@ -707,22 +750,21 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
|
||||
|
||||
mlx5_cmd_init_async_ctx(dev->mdev, &dev->async_ctx);
|
||||
timer_setup(&dev->delay_timer, delay_time_func, 0);
|
||||
for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
|
||||
for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) {
|
||||
ent = &cache->ent[i];
|
||||
INIT_LIST_HEAD(&ent->head);
|
||||
spin_lock_init(&ent->lock);
|
||||
xa_init_flags(&ent->mkeys, XA_FLAGS_LOCK_IRQ);
|
||||
ent->order = i + 2;
|
||||
ent->dev = dev;
|
||||
ent->limit = 0;
|
||||
|
||||
INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);
|
||||
|
||||
if (i > MR_CACHE_LAST_STD_ENTRY) {
|
||||
mlx5_odp_init_mr_cache_entry(ent);
|
||||
if (i > MKEY_CACHE_LAST_STD_ENTRY) {
|
||||
mlx5_odp_init_mkey_cache_entry(ent);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (ent->order > mr_cache_max_order(dev))
|
||||
if (ent->order > mkey_cache_max_order(dev))
|
||||
continue;
|
||||
|
||||
ent->page = PAGE_SHIFT;
|
||||
@@ -734,36 +776,36 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
|
||||
ent->limit = dev->mdev->profile.mr_cache[i].limit;
|
||||
else
|
||||
ent->limit = 0;
|
||||
spin_lock_irq(&ent->lock);
|
||||
xa_lock_irq(&ent->mkeys);
|
||||
queue_adjust_cache_locked(ent);
|
||||
spin_unlock_irq(&ent->lock);
|
||||
xa_unlock_irq(&ent->mkeys);
|
||||
}
|
||||
|
||||
mlx5_mr_cache_debugfs_init(dev);
|
||||
mlx5_mkey_cache_debugfs_init(dev);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev)
|
||||
int mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev)
|
||||
{
|
||||
unsigned int i;
|
||||
|
||||
if (!dev->cache.wq)
|
||||
return 0;
|
||||
|
||||
for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
|
||||
for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) {
|
||||
struct mlx5_cache_ent *ent = &dev->cache.ent[i];
|
||||
|
||||
spin_lock_irq(&ent->lock);
|
||||
xa_lock_irq(&ent->mkeys);
|
||||
ent->disabled = true;
|
||||
spin_unlock_irq(&ent->lock);
|
||||
xa_unlock_irq(&ent->mkeys);
|
||||
cancel_delayed_work_sync(&ent->dwork);
|
||||
}
|
||||
|
||||
mlx5_mr_cache_debugfs_cleanup(dev);
|
||||
mlx5_mkey_cache_debugfs_cleanup(dev);
|
||||
mlx5_cmd_cleanup_async_ctx(&dev->async_ctx);
|
||||
|
||||
for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++)
|
||||
for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++)
|
||||
clean_keys(dev, i);
|
||||
|
||||
destroy_workqueue(dev->cache.wq);
|
||||
@@ -830,22 +872,22 @@ static int get_octo_len(u64 addr, u64 len, int page_shift)
|
||||
return (npages + 1) / 2;
|
||||
}
|
||||
|
||||
static int mr_cache_max_order(struct mlx5_ib_dev *dev)
|
||||
static int mkey_cache_max_order(struct mlx5_ib_dev *dev)
|
||||
{
|
||||
if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
|
||||
return MR_CACHE_LAST_STD_ENTRY + 2;
|
||||
return MKEY_CACHE_LAST_STD_ENTRY + 2;
|
||||
return MLX5_MAX_UMR_SHIFT;
|
||||
}
|
||||
|
||||
static struct mlx5_cache_ent *mr_cache_ent_from_order(struct mlx5_ib_dev *dev,
|
||||
unsigned int order)
|
||||
static struct mlx5_cache_ent *mkey_cache_ent_from_order(struct mlx5_ib_dev *dev,
|
||||
unsigned int order)
|
||||
{
|
||||
struct mlx5_mr_cache *cache = &dev->cache;
|
||||
struct mlx5_mkey_cache *cache = &dev->cache;
|
||||
|
||||
if (order < cache->ent[0].order)
|
||||
return &cache->ent[0];
|
||||
order = order - cache->ent[0].order;
|
||||
if (order > MR_CACHE_LAST_STD_ENTRY)
|
||||
if (order > MKEY_CACHE_LAST_STD_ENTRY)
|
||||
return NULL;
|
||||
return &cache->ent[order];
|
||||
}
|
||||
@@ -888,7 +930,7 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
|
||||
0, iova);
|
||||
if (WARN_ON(!page_size))
|
||||
return ERR_PTR(-EINVAL);
|
||||
ent = mr_cache_ent_from_order(
|
||||
ent = mkey_cache_ent_from_order(
|
||||
dev, order_base_2(ib_umem_num_dma_blocks(umem, page_size)));
|
||||
/*
|
||||
* Matches access in alloc_cache_mr(). If the MR can't come from the
|
||||
@@ -1320,7 +1362,7 @@ static bool can_use_umr_rereg_pas(struct mlx5_ib_mr *mr,
|
||||
struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
|
||||
|
||||
/* We only track the allocated sizes of MRs from the cache */
|
||||
if (!mr->cache_ent)
|
||||
if (!mr->mmkey.cache_ent)
|
||||
return false;
|
||||
if (!mlx5r_umr_can_load_pas(dev, new_umem->length))
|
||||
return false;
|
||||
@@ -1329,7 +1371,7 @@ static bool can_use_umr_rereg_pas(struct mlx5_ib_mr *mr,
|
||||
mlx5_umem_find_best_pgsz(new_umem, mkc, log_page_size, 0, iova);
|
||||
if (WARN_ON(!*page_size))
|
||||
return false;
|
||||
return (1ULL << mr->cache_ent->order) >=
|
||||
return (1ULL << mr->mmkey.cache_ent->order) >=
|
||||
ib_umem_num_dma_blocks(new_umem, *page_size);
|
||||
}
|
||||
|
||||
@@ -1570,15 +1612,17 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
|
||||
}
|
||||
|
||||
/* Stop DMA */
|
||||
if (mr->cache_ent) {
|
||||
if (mlx5r_umr_revoke_mr(mr)) {
|
||||
spin_lock_irq(&mr->cache_ent->lock);
|
||||
mr->cache_ent->total_mrs--;
|
||||
spin_unlock_irq(&mr->cache_ent->lock);
|
||||
mr->cache_ent = NULL;
|
||||
}
|
||||
if (mr->mmkey.cache_ent) {
|
||||
xa_lock_irq(&mr->mmkey.cache_ent->mkeys);
|
||||
mr->mmkey.cache_ent->in_use--;
|
||||
xa_unlock_irq(&mr->mmkey.cache_ent->mkeys);
|
||||
|
||||
if (mlx5r_umr_revoke_mr(mr) ||
|
||||
push_mkey(mr->mmkey.cache_ent, false,
|
||||
xa_mk_value(mr->mmkey.key)))
|
||||
mr->mmkey.cache_ent = NULL;
|
||||
}
|
||||
if (!mr->cache_ent) {
|
||||
if (!mr->mmkey.cache_ent) {
|
||||
rc = destroy_mkey(to_mdev(mr->ibmr.device), mr);
|
||||
if (rc)
|
||||
return rc;
|
||||
@@ -1595,12 +1639,10 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
|
||||
mlx5_ib_free_odp_mr(mr);
|
||||
}
|
||||
|
||||
if (mr->cache_ent) {
|
||||
mlx5_mr_cache_free(dev, mr);
|
||||
} else {
|
||||
if (!mr->mmkey.cache_ent)
|
||||
mlx5_free_priv_descs(mr);
|
||||
kfree(mr);
|
||||
}
|
||||
|
||||
kfree(mr);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
@@ -1588,7 +1588,7 @@ mlx5_ib_odp_destroy_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq)
|
||||
return err;
|
||||
}
|
||||
|
||||
void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent)
|
||||
void mlx5_odp_init_mkey_cache_entry(struct mlx5_cache_ent *ent)
|
||||
{
|
||||
if (!(ent->dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
|
||||
return;
|
||||
|
||||
@@ -176,6 +176,7 @@ int mlx5r_umr_resource_init(struct mlx5_ib_dev *dev)
|
||||
dev->umrc.pd = pd;
|
||||
|
||||
sema_init(&dev->umrc.sem, MAX_UMR_WR);
|
||||
mutex_init(&dev->umrc.lock);
|
||||
|
||||
return 0;
|
||||
|
||||
@@ -195,6 +196,31 @@ void mlx5r_umr_resource_cleanup(struct mlx5_ib_dev *dev)
|
||||
ib_dealloc_pd(dev->umrc.pd);
|
||||
}
|
||||
|
||||
static int mlx5r_umr_recover(struct mlx5_ib_dev *dev)
|
||||
{
|
||||
struct umr_common *umrc = &dev->umrc;
|
||||
struct ib_qp_attr attr;
|
||||
int err;
|
||||
|
||||
attr.qp_state = IB_QPS_RESET;
|
||||
err = ib_modify_qp(umrc->qp, &attr, IB_QP_STATE);
|
||||
if (err) {
|
||||
mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n");
|
||||
goto err;
|
||||
}
|
||||
|
||||
err = mlx5r_umr_qp_rst2rts(dev, umrc->qp);
|
||||
if (err)
|
||||
goto err;
|
||||
|
||||
umrc->state = MLX5_UMR_STATE_ACTIVE;
|
||||
return 0;
|
||||
|
||||
err:
|
||||
umrc->state = MLX5_UMR_STATE_ERR;
|
||||
return err;
|
||||
}
|
||||
|
||||
static int mlx5r_umr_post_send(struct ib_qp *ibqp, u32 mkey, struct ib_cqe *cqe,
|
||||
struct mlx5r_umr_wqe *wqe, bool with_data)
|
||||
{
|
||||
@@ -231,7 +257,7 @@ static int mlx5r_umr_post_send(struct ib_qp *ibqp, u32 mkey, struct ib_cqe *cqe,
|
||||
|
||||
id.ib_cqe = cqe;
|
||||
mlx5r_finish_wqe(qp, ctrl, seg, size, cur_edge, idx, id.wr_id, 0,
|
||||
MLX5_FENCE_MODE_NONE, MLX5_OPCODE_UMR);
|
||||
MLX5_FENCE_MODE_INITIATOR_SMALL, MLX5_OPCODE_UMR);
|
||||
|
||||
mlx5r_ring_db(qp, 1, ctrl);
|
||||
|
||||
@@ -270,17 +296,49 @@ static int mlx5r_umr_post_send_wait(struct mlx5_ib_dev *dev, u32 mkey,
|
||||
mlx5r_umr_init_context(&umr_context);
|
||||
|
||||
down(&umrc->sem);
|
||||
err = mlx5r_umr_post_send(umrc->qp, mkey, &umr_context.cqe, wqe,
|
||||
with_data);
|
||||
if (err)
|
||||
mlx5_ib_warn(dev, "UMR post send failed, err %d\n", err);
|
||||
else {
|
||||
wait_for_completion(&umr_context.done);
|
||||
if (umr_context.status != IB_WC_SUCCESS) {
|
||||
mlx5_ib_warn(dev, "reg umr failed (%u)\n",
|
||||
umr_context.status);
|
||||
while (true) {
|
||||
mutex_lock(&umrc->lock);
|
||||
if (umrc->state == MLX5_UMR_STATE_ERR) {
|
||||
mutex_unlock(&umrc->lock);
|
||||
err = -EFAULT;
|
||||
break;
|
||||
}
|
||||
|
||||
if (umrc->state == MLX5_UMR_STATE_RECOVER) {
|
||||
mutex_unlock(&umrc->lock);
|
||||
usleep_range(3000, 5000);
|
||||
continue;
|
||||
}
|
||||
|
||||
err = mlx5r_umr_post_send(umrc->qp, mkey, &umr_context.cqe, wqe,
|
||||
with_data);
|
||||
mutex_unlock(&umrc->lock);
|
||||
if (err) {
|
||||
mlx5_ib_warn(dev, "UMR post send failed, err %d\n",
|
||||
err);
|
||||
break;
|
||||
}
|
||||
|
||||
wait_for_completion(&umr_context.done);
|
||||
|
||||
if (umr_context.status == IB_WC_SUCCESS)
|
||||
break;
|
||||
|
||||
if (umr_context.status == IB_WC_WR_FLUSH_ERR)
|
||||
continue;
|
||||
|
||||
WARN_ON_ONCE(1);
|
||||
mlx5_ib_warn(dev,
|
||||
"reg umr failed (%u). Trying to recover and resubmit the flushed WQEs\n",
|
||||
umr_context.status);
|
||||
mutex_lock(&umrc->lock);
|
||||
err = mlx5r_umr_recover(dev);
|
||||
mutex_unlock(&umrc->lock);
|
||||
if (err)
|
||||
mlx5_ib_warn(dev, "couldn't recover UMR, err %d\n",
|
||||
err);
|
||||
err = -EFAULT;
|
||||
break;
|
||||
}
|
||||
up(&umrc->sem);
|
||||
return err;
|
||||
|
||||
@@ -3084,7 +3084,7 @@ static struct qedr_mr *__qedr_alloc_mr(struct ib_pd *ibpd,
|
||||
else
|
||||
DP_ERR(dev, "roce alloc tid returned error %d\n", rc);
|
||||
|
||||
goto err0;
|
||||
goto err1;
|
||||
}
|
||||
|
||||
/* Index only, 18 bit long, lkey = itid << 8 | key */
|
||||
@@ -3108,7 +3108,7 @@ static struct qedr_mr *__qedr_alloc_mr(struct ib_pd *ibpd,
|
||||
rc = dev->ops->rdma_register_tid(dev->rdma_ctx, &mr->hw_mr);
|
||||
if (rc) {
|
||||
DP_ERR(dev, "roce register tid returned an error %d\n", rc);
|
||||
goto err1;
|
||||
goto err2;
|
||||
}
|
||||
|
||||
mr->ibmr.lkey = mr->hw_mr.itid << 8 | mr->hw_mr.key;
|
||||
@@ -3117,8 +3117,10 @@ static struct qedr_mr *__qedr_alloc_mr(struct ib_pd *ibpd,
|
||||
DP_DEBUG(dev, QEDR_MSG_MR, "alloc frmr: %x\n", mr->ibmr.lkey);
|
||||
return mr;
|
||||
|
||||
err1:
|
||||
err2:
|
||||
dev->ops->rdma_free_tid(dev->rdma_ctx, mr->hw_mr.itid);
|
||||
err1:
|
||||
qedr_free_pbl(dev, &mr->info.pbl_info, mr->info.pbl_table);
|
||||
err0:
|
||||
kfree(mr);
|
||||
return ERR_PTR(rc);
|
||||
|
||||
@@ -321,7 +321,7 @@ struct qib_verbs_txreq {
|
||||
* These 7 values (SDR, DDR, and QDR may be ORed for auto-speed
|
||||
* negotiation) are used for the 3rd argument to path_f_set_ib_cfg
|
||||
* with cmd QIB_IB_CFG_SPD_ENB, by direct calls or via sysfs. They
|
||||
* are also the the possible values for qib_link_speed_enabled and active
|
||||
* are also the possible values for qib_link_speed_enabled and active
|
||||
* The values were chosen to match values used within the IB spec.
|
||||
*/
|
||||
#define QIB_IB_SDR 1
|
||||
|
||||
@@ -153,7 +153,7 @@ static int qib_get_base_info(struct file *fp, void __user *ubase,
|
||||
kinfo->spi_tidcnt += dd->rcvtidcnt % subctxt_cnt;
|
||||
/*
|
||||
* for this use, may be cfgctxts summed over all chips that
|
||||
* are are configured and present
|
||||
* are configured and present
|
||||
*/
|
||||
kinfo->spi_nctxts = dd->cfgctxts;
|
||||
/* unit (chip/board) our context is on */
|
||||
@@ -851,7 +851,7 @@ static int mmap_rcvegrbufs(struct vm_area_struct *vma,
|
||||
ret = -EPERM;
|
||||
goto bail;
|
||||
}
|
||||
/* don't allow them to later change to writeable with mprotect */
|
||||
/* don't allow them to later change to writable with mprotect */
|
||||
vma->vm_flags &= ~VM_MAYWRITE;
|
||||
|
||||
start = vma->vm_start;
|
||||
@@ -941,7 +941,7 @@ static int mmap_kvaddr(struct vm_area_struct *vma, u64 pgaddr,
|
||||
goto bail;
|
||||
}
|
||||
/*
|
||||
* Don't allow permission to later change to writeable
|
||||
* Don't allow permission to later change to writable
|
||||
* with mprotect.
|
||||
*/
|
||||
vma->vm_flags &= ~VM_MAYWRITE;
|
||||
|
||||
@@ -58,7 +58,7 @@ static void qib_set_ib_7220_lstate(struct qib_pportdata *, u16, u16);
|
||||
/*
|
||||
* This file contains almost all the chip-specific register information and
|
||||
* access functions for the QLogic QLogic_IB 7220 PCI-Express chip, with the
|
||||
* exception of SerDes support, which in in qib_sd7220.c.
|
||||
* exception of SerDes support, which in qib_sd7220.c.
|
||||
*/
|
||||
|
||||
/* Below uses machine-generated qib_chipnum_regs.h file */
|
||||
|
||||
@@ -2850,9 +2850,9 @@ static void qib_setup_7322_cleanup(struct qib_devdata *dd)
|
||||
|
||||
qib_7322_free_irq(dd);
|
||||
kfree(dd->cspec->cntrs);
|
||||
kfree(dd->cspec->sendchkenable);
|
||||
kfree(dd->cspec->sendgrhchk);
|
||||
kfree(dd->cspec->sendibchk);
|
||||
bitmap_free(dd->cspec->sendchkenable);
|
||||
bitmap_free(dd->cspec->sendgrhchk);
|
||||
bitmap_free(dd->cspec->sendibchk);
|
||||
kfree(dd->cspec->msix_entries);
|
||||
for (i = 0; i < dd->num_pports; i++) {
|
||||
unsigned long flags;
|
||||
@@ -6383,18 +6383,11 @@ static int qib_init_7322_variables(struct qib_devdata *dd)
|
||||
features = qib_7322_boardname(dd);
|
||||
|
||||
/* now that piobcnt2k and 4k set, we can allocate these */
|
||||
sbufcnt = dd->piobcnt2k + dd->piobcnt4k +
|
||||
NUM_VL15_BUFS + BITS_PER_LONG - 1;
|
||||
sbufcnt /= BITS_PER_LONG;
|
||||
dd->cspec->sendchkenable =
|
||||
kmalloc_array(sbufcnt, sizeof(*dd->cspec->sendchkenable),
|
||||
GFP_KERNEL);
|
||||
dd->cspec->sendgrhchk =
|
||||
kmalloc_array(sbufcnt, sizeof(*dd->cspec->sendgrhchk),
|
||||
GFP_KERNEL);
|
||||
dd->cspec->sendibchk =
|
||||
kmalloc_array(sbufcnt, sizeof(*dd->cspec->sendibchk),
|
||||
GFP_KERNEL);
|
||||
sbufcnt = dd->piobcnt2k + dd->piobcnt4k + NUM_VL15_BUFS;
|
||||
|
||||
dd->cspec->sendchkenable = bitmap_zalloc(sbufcnt, GFP_KERNEL);
|
||||
dd->cspec->sendgrhchk = bitmap_zalloc(sbufcnt, GFP_KERNEL);
|
||||
dd->cspec->sendibchk = bitmap_zalloc(sbufcnt, GFP_KERNEL);
|
||||
if (!dd->cspec->sendchkenable || !dd->cspec->sendgrhchk ||
|
||||
!dd->cspec->sendibchk) {
|
||||
ret = -ENOMEM;
|
||||
|
||||
@@ -1106,8 +1106,7 @@ struct qib_devdata *qib_alloc_devdata(struct pci_dev *pdev, size_t extra)
|
||||
if (!qib_cpulist_count) {
|
||||
u32 count = num_online_cpus();
|
||||
|
||||
qib_cpulist = kcalloc(BITS_TO_LONGS(count), sizeof(long),
|
||||
GFP_KERNEL);
|
||||
qib_cpulist = bitmap_zalloc(count, GFP_KERNEL);
|
||||
if (qib_cpulist)
|
||||
qib_cpulist_count = count;
|
||||
}
|
||||
@@ -1279,7 +1278,7 @@ static void __exit qib_ib_cleanup(void)
|
||||
#endif
|
||||
|
||||
qib_cpulist_count = 0;
|
||||
kfree(qib_cpulist);
|
||||
bitmap_free(qib_cpulist);
|
||||
|
||||
WARN_ON(!xa_empty(&qib_dev_table));
|
||||
qib_dev_cleanup();
|
||||
|
||||
@@ -587,7 +587,7 @@ static int epb_access(struct qib_devdata *dd, int sdnum, int claim)
|
||||
/* Need to release */
|
||||
u64 pollval;
|
||||
/*
|
||||
* The only writeable bits are the request and CS.
|
||||
* The only writable bits are the request and CS.
|
||||
* Both should be clear
|
||||
*/
|
||||
u64 newval = 0;
|
||||
|
||||
@@ -482,7 +482,7 @@ int usnic_uiom_attach_dev_to_pd(struct usnic_uiom_pd *pd, struct device *dev)
|
||||
if (err)
|
||||
goto out_free_dev;
|
||||
|
||||
if (!iommu_capable(dev->bus, IOMMU_CAP_CACHE_COHERENCY)) {
|
||||
if (!device_iommu_capable(dev, IOMMU_CAP_CACHE_COHERENCY)) {
|
||||
usnic_err("IOMMU of %s does not support cache coherency\n",
|
||||
dev_name(dev));
|
||||
err = -EINVAL;
|
||||
|
||||
@@ -114,6 +114,8 @@ void retransmit_timer(struct timer_list *t)
|
||||
{
|
||||
struct rxe_qp *qp = from_timer(qp, t, retrans_timer);
|
||||
|
||||
pr_debug("%s: fired for qp#%d\n", __func__, qp->elem.index);
|
||||
|
||||
if (qp->valid) {
|
||||
qp->comp.timeout = 1;
|
||||
rxe_run_task(&qp->comp.task, 1);
|
||||
@@ -560,17 +562,16 @@ int rxe_completer(void *arg)
|
||||
struct sk_buff *skb = NULL;
|
||||
struct rxe_pkt_info *pkt = NULL;
|
||||
enum comp_state state;
|
||||
int ret = 0;
|
||||
int ret;
|
||||
|
||||
if (!rxe_get(qp))
|
||||
return -EAGAIN;
|
||||
|
||||
if (!qp->valid || qp->req.state == QP_STATE_ERROR ||
|
||||
qp->req.state == QP_STATE_RESET) {
|
||||
if (!qp->valid || qp->comp.state == QP_STATE_ERROR ||
|
||||
qp->comp.state == QP_STATE_RESET) {
|
||||
rxe_drain_resp_pkts(qp, qp->valid &&
|
||||
qp->req.state == QP_STATE_ERROR);
|
||||
ret = -EAGAIN;
|
||||
goto done;
|
||||
qp->comp.state == QP_STATE_ERROR);
|
||||
goto exit;
|
||||
}
|
||||
|
||||
if (qp->comp.timeout) {
|
||||
@@ -580,10 +581,8 @@ int rxe_completer(void *arg)
|
||||
qp->comp.timeout_retry = 0;
|
||||
}
|
||||
|
||||
if (qp->req.need_retry) {
|
||||
ret = -EAGAIN;
|
||||
goto done;
|
||||
}
|
||||
if (qp->req.need_retry)
|
||||
goto exit;
|
||||
|
||||
state = COMPST_GET_ACK;
|
||||
|
||||
@@ -676,8 +675,7 @@ int rxe_completer(void *arg)
|
||||
qp->qp_timeout_jiffies)
|
||||
mod_timer(&qp->retrans_timer,
|
||||
jiffies + qp->qp_timeout_jiffies);
|
||||
ret = -EAGAIN;
|
||||
goto done;
|
||||
goto exit;
|
||||
|
||||
case COMPST_ERROR_RETRY:
|
||||
/* we come here if the retry timer fired and we did
|
||||
@@ -689,10 +687,8 @@ int rxe_completer(void *arg)
|
||||
*/
|
||||
|
||||
/* there is nothing to retry in this case */
|
||||
if (!wqe || (wqe->state == wqe_state_posted)) {
|
||||
ret = -EAGAIN;
|
||||
goto done;
|
||||
}
|
||||
if (!wqe || (wqe->state == wqe_state_posted))
|
||||
goto exit;
|
||||
|
||||
/* if we've started a retry, don't start another
|
||||
* retry sequence, unless this is a timeout.
|
||||
@@ -730,18 +726,21 @@ int rxe_completer(void *arg)
|
||||
break;
|
||||
|
||||
case COMPST_RNR_RETRY:
|
||||
/* we come here if we received an RNR NAK */
|
||||
if (qp->comp.rnr_retry > 0) {
|
||||
if (qp->comp.rnr_retry != 7)
|
||||
qp->comp.rnr_retry--;
|
||||
|
||||
qp->req.need_retry = 1;
|
||||
/* don't start a retry flow until the
|
||||
* rnr timer has fired
|
||||
*/
|
||||
qp->req.wait_for_rnr_timer = 1;
|
||||
pr_debug("qp#%d set rnr nak timer\n",
|
||||
qp_num(qp));
|
||||
mod_timer(&qp->rnr_nak_timer,
|
||||
jiffies + rnrnak_jiffies(aeth_syn(pkt)
|
||||
& ~AETH_TYPE_MASK));
|
||||
ret = -EAGAIN;
|
||||
goto done;
|
||||
goto exit;
|
||||
} else {
|
||||
rxe_counter_inc(rxe,
|
||||
RXE_CNT_RNR_RETRY_EXCEEDED);
|
||||
@@ -754,12 +753,20 @@ int rxe_completer(void *arg)
|
||||
WARN_ON_ONCE(wqe->status == IB_WC_SUCCESS);
|
||||
do_complete(qp, wqe);
|
||||
rxe_qp_error(qp);
|
||||
ret = -EAGAIN;
|
||||
goto done;
|
||||
goto exit;
|
||||
}
|
||||
}
|
||||
|
||||
/* A non-zero return value will cause rxe_do_task to
|
||||
* exit its loop and end the tasklet. A zero return
|
||||
* will continue looping and return to rxe_completer
|
||||
*/
|
||||
done:
|
||||
ret = 0;
|
||||
goto out;
|
||||
exit:
|
||||
ret = -EAGAIN;
|
||||
out:
|
||||
if (pkt)
|
||||
free_pkt(pkt);
|
||||
rxe_put(qp);
|
||||
|
||||
@@ -19,16 +19,16 @@ int rxe_cq_chk_attr(struct rxe_dev *rxe, struct rxe_cq *cq,
|
||||
}
|
||||
|
||||
if (cqe > rxe->attr.max_cqe) {
|
||||
pr_warn("cqe(%d) > max_cqe(%d)\n",
|
||||
cqe, rxe->attr.max_cqe);
|
||||
pr_debug("cqe(%d) > max_cqe(%d)\n",
|
||||
cqe, rxe->attr.max_cqe);
|
||||
goto err1;
|
||||
}
|
||||
|
||||
if (cq) {
|
||||
count = queue_count(cq->queue, QUEUE_TYPE_TO_CLIENT);
|
||||
if (cqe < count) {
|
||||
pr_warn("cqe(%d) < current # elements in queue (%d)",
|
||||
cqe, count);
|
||||
pr_debug("cqe(%d) < current # elements in queue (%d)",
|
||||
cqe, count);
|
||||
goto err1;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -77,9 +77,8 @@ struct rxe_mr *lookup_mr(struct rxe_pd *pd, int access, u32 key,
|
||||
enum rxe_mr_lookup_type type);
|
||||
int mr_check_range(struct rxe_mr *mr, u64 iova, size_t length);
|
||||
int advance_dma_data(struct rxe_dma_info *dma, unsigned int length);
|
||||
int rxe_invalidate_mr(struct rxe_qp *qp, u32 rkey);
|
||||
int rxe_invalidate_mr(struct rxe_qp *qp, u32 key);
|
||||
int rxe_reg_fast_mr(struct rxe_qp *qp, struct rxe_send_wqe *wqe);
|
||||
int rxe_mr_set_page(struct ib_mr *ibmr, u64 addr);
|
||||
int rxe_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata);
|
||||
void rxe_mr_cleanup(struct rxe_pool_elem *elem);
|
||||
|
||||
@@ -145,7 +144,7 @@ static inline int rcv_wqe_size(int max_sge)
|
||||
max_sge * sizeof(struct ib_sge);
|
||||
}
|
||||
|
||||
void free_rd_atomic_resource(struct rxe_qp *qp, struct resp_res *res);
|
||||
void free_rd_atomic_resource(struct resp_res *res);
|
||||
|
||||
static inline void rxe_advance_resp_resource(struct rxe_qp *qp)
|
||||
{
|
||||
|
||||
@@ -24,7 +24,7 @@ u8 rxe_get_next_key(u32 last_key)
|
||||
|
||||
int mr_check_range(struct rxe_mr *mr, u64 iova, size_t length)
|
||||
{
|
||||
struct rxe_map_set *set = mr->cur_map_set;
|
||||
|
||||
|
||||
switch (mr->type) {
|
||||
case IB_MR_TYPE_DMA:
|
||||
@@ -32,8 +32,8 @@ int mr_check_range(struct rxe_mr *mr, u64 iova, size_t length)
|
||||
|
||||
case IB_MR_TYPE_USER:
|
||||
case IB_MR_TYPE_MEM_REG:
|
||||
if (iova < set->iova || length > set->length ||
|
||||
iova > set->iova + set->length - length)
|
||||
if (iova < mr->iova || length > mr->length ||
|
||||
iova > mr->iova + mr->length - length)
|
||||
return -EFAULT;
|
||||
return 0;
|
||||
|
||||
@@ -65,89 +65,41 @@ static void rxe_mr_init(int access, struct rxe_mr *mr)
|
||||
mr->map_shift = ilog2(RXE_BUF_PER_MAP);
|
||||
}
|
||||
|
||||
static void rxe_mr_free_map_set(int num_map, struct rxe_map_set *set)
|
||||
static int rxe_mr_alloc(struct rxe_mr *mr, int num_buf)
|
||||
{
|
||||
int i;
|
||||
int num_map;
|
||||
struct rxe_map **map = mr->map;
|
||||
|
||||
for (i = 0; i < num_map; i++)
|
||||
kfree(set->map[i]);
|
||||
num_map = (num_buf + RXE_BUF_PER_MAP - 1) / RXE_BUF_PER_MAP;
|
||||
|
||||
kfree(set->map);
|
||||
kfree(set);
|
||||
}
|
||||
|
||||
static int rxe_mr_alloc_map_set(int num_map, struct rxe_map_set **setp)
|
||||
{
|
||||
int i;
|
||||
struct rxe_map_set *set;
|
||||
|
||||
set = kmalloc(sizeof(*set), GFP_KERNEL);
|
||||
if (!set)
|
||||
goto err_out;
|
||||
|
||||
set->map = kmalloc_array(num_map, sizeof(struct rxe_map *), GFP_KERNEL);
|
||||
if (!set->map)
|
||||
goto err_free_set;
|
||||
mr->map = kmalloc_array(num_map, sizeof(*map), GFP_KERNEL);
|
||||
if (!mr->map)
|
||||
goto err1;
|
||||
|
||||
for (i = 0; i < num_map; i++) {
|
||||
set->map[i] = kmalloc(sizeof(struct rxe_map), GFP_KERNEL);
|
||||
if (!set->map[i])
|
||||
goto err_free_map;
|
||||
mr->map[i] = kmalloc(sizeof(**map), GFP_KERNEL);
|
||||
if (!mr->map[i])
|
||||
goto err2;
|
||||
}
|
||||
|
||||
*setp = set;
|
||||
|
||||
return 0;
|
||||
|
||||
err_free_map:
|
||||
for (i--; i >= 0; i--)
|
||||
kfree(set->map[i]);
|
||||
|
||||
kfree(set->map);
|
||||
err_free_set:
|
||||
kfree(set);
|
||||
err_out:
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
/**
|
||||
* rxe_mr_alloc() - Allocate memory map array(s) for MR
|
||||
* @mr: Memory region
|
||||
* @num_buf: Number of buffer descriptors to support
|
||||
* @both: If non zero allocate both mr->map and mr->next_map
|
||||
* else just allocate mr->map. Used for fast MRs
|
||||
*
|
||||
* Return: 0 on success else an error
|
||||
*/
|
||||
static int rxe_mr_alloc(struct rxe_mr *mr, int num_buf, int both)
|
||||
{
|
||||
int ret;
|
||||
int num_map;
|
||||
|
||||
BUILD_BUG_ON(!is_power_of_2(RXE_BUF_PER_MAP));
|
||||
num_map = (num_buf + RXE_BUF_PER_MAP - 1) / RXE_BUF_PER_MAP;
|
||||
|
||||
mr->map_shift = ilog2(RXE_BUF_PER_MAP);
|
||||
mr->map_mask = RXE_BUF_PER_MAP - 1;
|
||||
|
||||
mr->num_buf = num_buf;
|
||||
mr->max_buf = num_map * RXE_BUF_PER_MAP;
|
||||
mr->num_map = num_map;
|
||||
|
||||
ret = rxe_mr_alloc_map_set(num_map, &mr->cur_map_set);
|
||||
if (ret)
|
||||
return -ENOMEM;
|
||||
|
||||
if (both) {
|
||||
ret = rxe_mr_alloc_map_set(num_map, &mr->next_map_set);
|
||||
if (ret)
|
||||
goto err_free;
|
||||
}
|
||||
mr->max_buf = num_map * RXE_BUF_PER_MAP;
|
||||
|
||||
return 0;
|
||||
|
||||
err_free:
|
||||
rxe_mr_free_map_set(mr->num_map, mr->cur_map_set);
|
||||
mr->cur_map_set = NULL;
|
||||
err2:
|
||||
for (i--; i >= 0; i--)
|
||||
kfree(mr->map[i]);
|
||||
|
||||
kfree(mr->map);
|
||||
err1:
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
@@ -164,7 +116,6 @@ void rxe_mr_init_dma(struct rxe_pd *pd, int access, struct rxe_mr *mr)
|
||||
int rxe_mr_init_user(struct rxe_pd *pd, u64 start, u64 length, u64 iova,
|
||||
int access, struct rxe_mr *mr)
|
||||
{
|
||||
struct rxe_map_set *set;
|
||||
struct rxe_map **map;
|
||||
struct rxe_phys_buf *buf = NULL;
|
||||
struct ib_umem *umem;
|
||||
@@ -172,6 +123,7 @@ int rxe_mr_init_user(struct rxe_pd *pd, u64 start, u64 length, u64 iova,
|
||||
int num_buf;
|
||||
void *vaddr;
|
||||
int err;
|
||||
int i;
|
||||
|
||||
umem = ib_umem_get(pd->ibpd.device, start, length, access);
|
||||
if (IS_ERR(umem)) {
|
||||
@@ -185,20 +137,18 @@ int rxe_mr_init_user(struct rxe_pd *pd, u64 start, u64 length, u64 iova,
|
||||
|
||||
rxe_mr_init(access, mr);
|
||||
|
||||
err = rxe_mr_alloc(mr, num_buf, 0);
|
||||
err = rxe_mr_alloc(mr, num_buf);
|
||||
if (err) {
|
||||
pr_warn("%s: Unable to allocate memory for map\n",
|
||||
__func__);
|
||||
goto err_release_umem;
|
||||
}
|
||||
|
||||
set = mr->cur_map_set;
|
||||
set->page_shift = PAGE_SHIFT;
|
||||
set->page_mask = PAGE_SIZE - 1;
|
||||
|
||||
num_buf = 0;
|
||||
map = set->map;
|
||||
mr->page_shift = PAGE_SHIFT;
|
||||
mr->page_mask = PAGE_SIZE - 1;
|
||||
|
||||
num_buf = 0;
|
||||
map = mr->map;
|
||||
if (length > 0) {
|
||||
buf = map[0]->buf;
|
||||
|
||||
@@ -214,29 +164,33 @@ int rxe_mr_init_user(struct rxe_pd *pd, u64 start, u64 length, u64 iova,
|
||||
pr_warn("%s: Unable to get virtual address\n",
|
||||
__func__);
|
||||
err = -ENOMEM;
|
||||
goto err_release_umem;
|
||||
goto err_cleanup_map;
|
||||
}
|
||||
|
||||
buf->addr = (uintptr_t)vaddr;
|
||||
buf->size = PAGE_SIZE;
|
||||
num_buf++;
|
||||
buf++;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
mr->ibmr.pd = &pd->ibpd;
|
||||
mr->umem = umem;
|
||||
mr->access = access;
|
||||
mr->length = length;
|
||||
mr->iova = iova;
|
||||
mr->va = start;
|
||||
mr->offset = ib_umem_offset(umem);
|
||||
mr->state = RXE_MR_STATE_VALID;
|
||||
mr->type = IB_MR_TYPE_USER;
|
||||
|
||||
set->length = length;
|
||||
set->iova = iova;
|
||||
set->va = start;
|
||||
set->offset = ib_umem_offset(umem);
|
||||
|
||||
return 0;
|
||||
|
||||
err_cleanup_map:
|
||||
for (i = 0; i < mr->num_map; i++)
|
||||
kfree(mr->map[i]);
|
||||
kfree(mr->map);
|
||||
err_release_umem:
|
||||
ib_umem_release(umem);
|
||||
err_out:
|
||||
@@ -250,7 +204,7 @@ int rxe_mr_init_fast(struct rxe_pd *pd, int max_pages, struct rxe_mr *mr)
|
||||
/* always allow remote access for FMRs */
|
||||
rxe_mr_init(IB_ACCESS_REMOTE, mr);
|
||||
|
||||
err = rxe_mr_alloc(mr, max_pages, 1);
|
||||
err = rxe_mr_alloc(mr, max_pages);
|
||||
if (err)
|
||||
goto err1;
|
||||
|
||||
@@ -268,24 +222,21 @@ err1:
|
||||
static void lookup_iova(struct rxe_mr *mr, u64 iova, int *m_out, int *n_out,
|
||||
size_t *offset_out)
|
||||
{
|
||||
struct rxe_map_set *set = mr->cur_map_set;
|
||||
size_t offset = iova - set->iova + set->offset;
|
||||
size_t offset = iova - mr->iova + mr->offset;
|
||||
int map_index;
|
||||
int buf_index;
|
||||
u64 length;
|
||||
struct rxe_map *map;
|
||||
|
||||
if (likely(set->page_shift)) {
|
||||
*offset_out = offset & set->page_mask;
|
||||
offset >>= set->page_shift;
|
||||
if (likely(mr->page_shift)) {
|
||||
*offset_out = offset & mr->page_mask;
|
||||
offset >>= mr->page_shift;
|
||||
*n_out = offset & mr->map_mask;
|
||||
*m_out = offset >> mr->map_shift;
|
||||
} else {
|
||||
map_index = 0;
|
||||
buf_index = 0;
|
||||
|
||||
map = set->map[map_index];
|
||||
length = map->buf[buf_index].size;
|
||||
length = mr->map[map_index]->buf[buf_index].size;
|
||||
|
||||
while (offset >= length) {
|
||||
offset -= length;
|
||||
@@ -295,8 +246,7 @@ static void lookup_iova(struct rxe_mr *mr, u64 iova, int *m_out, int *n_out,
|
||||
map_index++;
|
||||
buf_index = 0;
|
||||
}
|
||||
map = set->map[map_index];
|
||||
length = map->buf[buf_index].size;
|
||||
length = mr->map[map_index]->buf[buf_index].size;
|
||||
}
|
||||
|
||||
*m_out = map_index;
|
||||
@@ -317,7 +267,7 @@ void *iova_to_vaddr(struct rxe_mr *mr, u64 iova, int length)
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (!mr->cur_map_set) {
|
||||
if (!mr->map) {
|
||||
addr = (void *)(uintptr_t)iova;
|
||||
goto out;
|
||||
}
|
||||
@@ -330,13 +280,13 @@ void *iova_to_vaddr(struct rxe_mr *mr, u64 iova, int length)
|
||||
|
||||
lookup_iova(mr, iova, &m, &n, &offset);
|
||||
|
||||
if (offset + length > mr->cur_map_set->map[m]->buf[n].size) {
|
||||
if (offset + length > mr->map[m]->buf[n].size) {
|
||||
pr_warn("crosses page boundary\n");
|
||||
addr = NULL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
addr = (void *)(uintptr_t)mr->cur_map_set->map[m]->buf[n].addr + offset;
|
||||
addr = (void *)(uintptr_t)mr->map[m]->buf[n].addr + offset;
|
||||
|
||||
out:
|
||||
return addr;
|
||||
@@ -372,7 +322,7 @@ int rxe_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, int length,
|
||||
return 0;
|
||||
}
|
||||
|
||||
WARN_ON_ONCE(!mr->cur_map_set);
|
||||
WARN_ON_ONCE(!mr->map);
|
||||
|
||||
err = mr_check_range(mr, iova, length);
|
||||
if (err) {
|
||||
@@ -382,7 +332,7 @@ int rxe_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, int length,
|
||||
|
||||
lookup_iova(mr, iova, &m, &i, &offset);
|
||||
|
||||
map = mr->cur_map_set->map + m;
|
||||
map = mr->map + m;
|
||||
buf = map[0]->buf + i;
|
||||
|
||||
while (length > 0) {
|
||||
@@ -576,22 +526,22 @@ struct rxe_mr *lookup_mr(struct rxe_pd *pd, int access, u32 key,
|
||||
return mr;
|
||||
}
|
||||
|
||||
int rxe_invalidate_mr(struct rxe_qp *qp, u32 rkey)
|
||||
int rxe_invalidate_mr(struct rxe_qp *qp, u32 key)
|
||||
{
|
||||
struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
|
||||
struct rxe_mr *mr;
|
||||
int ret;
|
||||
|
||||
mr = rxe_pool_get_index(&rxe->mr_pool, rkey >> 8);
|
||||
mr = rxe_pool_get_index(&rxe->mr_pool, key >> 8);
|
||||
if (!mr) {
|
||||
pr_err("%s: No MR for rkey %#x\n", __func__, rkey);
|
||||
pr_err("%s: No MR for key %#x\n", __func__, key);
|
||||
ret = -EINVAL;
|
||||
goto err;
|
||||
}
|
||||
|
||||
if (rkey != mr->rkey) {
|
||||
pr_err("%s: rkey (%#x) doesn't match mr->rkey (%#x)\n",
|
||||
__func__, rkey, mr->rkey);
|
||||
if (mr->rkey ? (key != mr->rkey) : (key != mr->lkey)) {
|
||||
pr_err("%s: wr key (%#x) doesn't match mr key (%#x)\n",
|
||||
__func__, key, (mr->rkey ? mr->rkey : mr->lkey));
|
||||
ret = -EINVAL;
|
||||
goto err_drop_ref;
|
||||
}
|
||||
@@ -628,9 +578,8 @@ err:
|
||||
int rxe_reg_fast_mr(struct rxe_qp *qp, struct rxe_send_wqe *wqe)
|
||||
{
|
||||
struct rxe_mr *mr = to_rmr(wqe->wr.wr.reg.mr);
|
||||
u32 key = wqe->wr.wr.reg.key & 0xff;
|
||||
u32 key = wqe->wr.wr.reg.key;
|
||||
u32 access = wqe->wr.wr.reg.access;
|
||||
struct rxe_map_set *set;
|
||||
|
||||
/* user can only register MR in free state */
|
||||
if (unlikely(mr->state != RXE_MR_STATE_FREE)) {
|
||||
@@ -646,36 +595,19 @@ int rxe_reg_fast_mr(struct rxe_qp *qp, struct rxe_send_wqe *wqe)
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
/* user is only allowed to change key portion of l/rkey */
|
||||
if (unlikely((mr->lkey & ~0xff) != (key & ~0xff))) {
|
||||
pr_warn("%s: key = 0x%x has wrong index mr->lkey = 0x%x\n",
|
||||
__func__, key, mr->lkey);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
mr->access = access;
|
||||
mr->lkey = (mr->lkey & ~0xff) | key;
|
||||
mr->rkey = (access & IB_ACCESS_REMOTE) ? mr->lkey : 0;
|
||||
mr->lkey = key;
|
||||
mr->rkey = (access & IB_ACCESS_REMOTE) ? key : 0;
|
||||
mr->iova = wqe->wr.wr.reg.mr->iova;
|
||||
mr->state = RXE_MR_STATE_VALID;
|
||||
|
||||
set = mr->cur_map_set;
|
||||
mr->cur_map_set = mr->next_map_set;
|
||||
mr->cur_map_set->iova = wqe->wr.wr.reg.mr->iova;
|
||||
mr->next_map_set = set;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int rxe_mr_set_page(struct ib_mr *ibmr, u64 addr)
|
||||
{
|
||||
struct rxe_mr *mr = to_rmr(ibmr);
|
||||
struct rxe_map_set *set = mr->next_map_set;
|
||||
struct rxe_map *map;
|
||||
struct rxe_phys_buf *buf;
|
||||
|
||||
if (unlikely(set->nbuf == mr->num_buf))
|
||||
return -ENOMEM;
|
||||
|
||||
map = set->map[set->nbuf / RXE_BUF_PER_MAP];
|
||||
buf = &map->buf[set->nbuf % RXE_BUF_PER_MAP];
|
||||
|
||||
buf->addr = addr;
|
||||
buf->size = ibmr->page_size;
|
||||
set->nbuf++;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -687,7 +619,7 @@ int rxe_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
|
||||
if (atomic_read(&mr->num_mw) > 0)
|
||||
return -EINVAL;
|
||||
|
||||
rxe_put(mr);
|
||||
rxe_cleanup(mr);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -695,14 +627,15 @@ int rxe_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
|
||||
void rxe_mr_cleanup(struct rxe_pool_elem *elem)
|
||||
{
|
||||
struct rxe_mr *mr = container_of(elem, typeof(*mr), elem);
|
||||
int i;
|
||||
|
||||
rxe_put(mr_pd(mr));
|
||||
|
||||
ib_umem_release(mr->umem);
|
||||
|
||||
if (mr->cur_map_set)
|
||||
rxe_mr_free_map_set(mr->num_map, mr->cur_map_set);
|
||||
if (mr->map) {
|
||||
for (i = 0; i < mr->num_map; i++)
|
||||
kfree(mr->map[i]);
|
||||
|
||||
if (mr->next_map_set)
|
||||
rxe_mr_free_map_set(mr->num_map, mr->next_map_set);
|
||||
kfree(mr->map);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -33,6 +33,8 @@ int rxe_alloc_mw(struct ib_mw *ibmw, struct ib_udata *udata)
|
||||
RXE_MW_STATE_FREE : RXE_MW_STATE_VALID;
|
||||
spin_lock_init(&mw->lock);
|
||||
|
||||
rxe_finalize(mw);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -40,7 +42,7 @@ int rxe_dealloc_mw(struct ib_mw *ibmw)
|
||||
{
|
||||
struct rxe_mw *mw = to_rmw(ibmw);
|
||||
|
||||
rxe_put(mw);
|
||||
rxe_cleanup(mw);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -48,8 +50,6 @@ int rxe_dealloc_mw(struct ib_mw *ibmw)
|
||||
static int rxe_check_bind_mw(struct rxe_qp *qp, struct rxe_send_wqe *wqe,
|
||||
struct rxe_mw *mw, struct rxe_mr *mr)
|
||||
{
|
||||
u32 key = wqe->wr.wr.mw.rkey & 0xff;
|
||||
|
||||
if (mw->ibmw.type == IB_MW_TYPE_1) {
|
||||
if (unlikely(mw->state != RXE_MW_STATE_VALID)) {
|
||||
pr_err_once(
|
||||
@@ -87,11 +87,6 @@ static int rxe_check_bind_mw(struct rxe_qp *qp, struct rxe_send_wqe *wqe,
|
||||
}
|
||||
}
|
||||
|
||||
if (unlikely(key == (mw->rkey & 0xff))) {
|
||||
pr_err_once("attempt to bind MW with same key\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
/* remaining checks only apply to a nonzero MR */
|
||||
if (!mr)
|
||||
return 0;
|
||||
@@ -113,21 +108,21 @@ static int rxe_check_bind_mw(struct rxe_qp *qp, struct rxe_send_wqe *wqe,
|
||||
(IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_ATOMIC)) &&
|
||||
!(mr->access & IB_ACCESS_LOCAL_WRITE))) {
|
||||
pr_err_once(
|
||||
"attempt to bind an writeable MW to an MR without local write access\n");
|
||||
"attempt to bind an Writable MW to an MR without local write access\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
/* C10-75 */
|
||||
if (mw->access & IB_ZERO_BASED) {
|
||||
if (unlikely(wqe->wr.wr.mw.length > mr->cur_map_set->length)) {
|
||||
if (unlikely(wqe->wr.wr.mw.length > mr->length)) {
|
||||
pr_err_once(
|
||||
"attempt to bind a ZB MW outside of the MR\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
} else {
|
||||
if (unlikely((wqe->wr.wr.mw.addr < mr->cur_map_set->iova) ||
|
||||
if (unlikely((wqe->wr.wr.mw.addr < mr->iova) ||
|
||||
((wqe->wr.wr.mw.addr + wqe->wr.wr.mw.length) >
|
||||
(mr->cur_map_set->iova + mr->cur_map_set->length)))) {
|
||||
(mr->iova + mr->length)))) {
|
||||
pr_err_once(
|
||||
"attempt to bind a VA MW outside of the MR\n");
|
||||
return -EINVAL;
|
||||
|
||||
@@ -105,6 +105,12 @@ enum rxe_device_param {
|
||||
RXE_INFLIGHT_SKBS_PER_QP_HIGH = 64,
|
||||
RXE_INFLIGHT_SKBS_PER_QP_LOW = 16,
|
||||
|
||||
/* Max number of iterations of each tasklet
|
||||
* before yielding the cpu to let other
|
||||
* work make progress
|
||||
*/
|
||||
RXE_MAX_ITERATIONS = 1024,
|
||||
|
||||
/* Delay before calling arbiter timer */
|
||||
RXE_NSEC_ARB_TIMER_DELAY = 200,
|
||||
|
||||
|
||||
@@ -6,6 +6,7 @@
|
||||
|
||||
#include "rxe.h"
|
||||
|
||||
#define RXE_POOL_TIMEOUT (200)
|
||||
#define RXE_POOL_ALIGN (16)
|
||||
|
||||
static const struct rxe_type_info {
|
||||
@@ -136,10 +137,14 @@ void *rxe_alloc(struct rxe_pool *pool)
|
||||
elem->pool = pool;
|
||||
elem->obj = obj;
|
||||
kref_init(&elem->ref_cnt);
|
||||
init_completion(&elem->complete);
|
||||
|
||||
err = xa_alloc_cyclic(&pool->xa, &elem->index, elem, pool->limit,
|
||||
/* allocate index in array but leave pointer as NULL so it
|
||||
* can't be looked up until rxe_finalize() is called
|
||||
*/
|
||||
err = xa_alloc_cyclic(&pool->xa, &elem->index, NULL, pool->limit,
|
||||
&pool->next, GFP_KERNEL);
|
||||
if (err)
|
||||
if (err < 0)
|
||||
goto err_free;
|
||||
|
||||
return obj;
|
||||
@@ -151,9 +156,11 @@ err_cnt:
|
||||
return NULL;
|
||||
}
|
||||
|
||||
int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_elem *elem)
|
||||
int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_elem *elem,
|
||||
bool sleepable)
|
||||
{
|
||||
int err;
|
||||
gfp_t gfp_flags;
|
||||
|
||||
if (WARN_ON(pool->type == RXE_TYPE_MR))
|
||||
return -EINVAL;
|
||||
@@ -164,10 +171,19 @@ int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_elem *elem)
|
||||
elem->pool = pool;
|
||||
elem->obj = (u8 *)elem - pool->elem_offset;
|
||||
kref_init(&elem->ref_cnt);
|
||||
init_completion(&elem->complete);
|
||||
|
||||
err = xa_alloc_cyclic(&pool->xa, &elem->index, elem, pool->limit,
|
||||
&pool->next, GFP_KERNEL);
|
||||
if (err)
|
||||
/* AH objects are unique in that the create_ah verb
|
||||
* can be called in atomic context. If the create_ah
|
||||
* call is not sleepable use GFP_ATOMIC.
|
||||
*/
|
||||
gfp_flags = sleepable ? GFP_KERNEL : GFP_ATOMIC;
|
||||
|
||||
if (sleepable)
|
||||
might_sleep();
|
||||
err = xa_alloc_cyclic(&pool->xa, &elem->index, NULL, pool->limit,
|
||||
&pool->next, gfp_flags);
|
||||
if (err < 0)
|
||||
goto err_cnt;
|
||||
|
||||
return 0;
|
||||
@@ -181,16 +197,15 @@ void *rxe_pool_get_index(struct rxe_pool *pool, u32 index)
|
||||
{
|
||||
struct rxe_pool_elem *elem;
|
||||
struct xarray *xa = &pool->xa;
|
||||
unsigned long flags;
|
||||
void *obj;
|
||||
|
||||
xa_lock_irqsave(xa, flags);
|
||||
rcu_read_lock();
|
||||
elem = xa_load(xa, index);
|
||||
if (elem && kref_get_unless_zero(&elem->ref_cnt))
|
||||
obj = elem->obj;
|
||||
else
|
||||
obj = NULL;
|
||||
xa_unlock_irqrestore(xa, flags);
|
||||
rcu_read_unlock();
|
||||
|
||||
return obj;
|
||||
}
|
||||
@@ -198,17 +213,74 @@ void *rxe_pool_get_index(struct rxe_pool *pool, u32 index)
|
||||
static void rxe_elem_release(struct kref *kref)
|
||||
{
|
||||
struct rxe_pool_elem *elem = container_of(kref, typeof(*elem), ref_cnt);
|
||||
struct rxe_pool *pool = elem->pool;
|
||||
|
||||
xa_erase(&pool->xa, elem->index);
|
||||
complete(&elem->complete);
|
||||
}
|
||||
|
||||
int __rxe_cleanup(struct rxe_pool_elem *elem, bool sleepable)
|
||||
{
|
||||
struct rxe_pool *pool = elem->pool;
|
||||
struct xarray *xa = &pool->xa;
|
||||
static int timeout = RXE_POOL_TIMEOUT;
|
||||
int ret, err = 0;
|
||||
void *xa_ret;
|
||||
|
||||
if (sleepable)
|
||||
might_sleep();
|
||||
|
||||
/* erase xarray entry to prevent looking up
|
||||
* the pool elem from its index
|
||||
*/
|
||||
xa_ret = xa_erase(xa, elem->index);
|
||||
WARN_ON(xa_err(xa_ret));
|
||||
|
||||
/* if this is the last call to rxe_put complete the
|
||||
* object. It is safe to touch obj->elem after this since
|
||||
* it is freed below
|
||||
*/
|
||||
__rxe_put(elem);
|
||||
|
||||
/* wait until all references to the object have been
|
||||
* dropped before final object specific cleanup and
|
||||
* return to rdma-core
|
||||
*/
|
||||
if (sleepable) {
|
||||
if (!completion_done(&elem->complete) && timeout) {
|
||||
ret = wait_for_completion_timeout(&elem->complete,
|
||||
timeout);
|
||||
|
||||
/* Shouldn't happen. There are still references to
|
||||
* the object but, rather than deadlock, free the
|
||||
* object or pass back to rdma-core.
|
||||
*/
|
||||
if (WARN_ON(!ret))
|
||||
err = -EINVAL;
|
||||
}
|
||||
} else {
|
||||
unsigned long until = jiffies + timeout;
|
||||
|
||||
/* AH objects are unique in that the destroy_ah verb
|
||||
* can be called in atomic context. This delay
|
||||
* replaces the wait_for_completion call above
|
||||
* when the destroy_ah call is not sleepable
|
||||
*/
|
||||
while (!completion_done(&elem->complete) &&
|
||||
time_before(jiffies, until))
|
||||
mdelay(1);
|
||||
|
||||
if (WARN_ON(!completion_done(&elem->complete)))
|
||||
err = -EINVAL;
|
||||
}
|
||||
|
||||
if (pool->cleanup)
|
||||
pool->cleanup(elem);
|
||||
|
||||
if (pool->type == RXE_TYPE_MR)
|
||||
kfree(elem->obj);
|
||||
kfree_rcu(elem->obj);
|
||||
|
||||
atomic_dec(&pool->num_elem);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
int __rxe_get(struct rxe_pool_elem *elem)
|
||||
@@ -220,3 +292,11 @@ int __rxe_put(struct rxe_pool_elem *elem)
|
||||
{
|
||||
return kref_put(&elem->ref_cnt, rxe_elem_release);
|
||||
}
|
||||
|
||||
void __rxe_finalize(struct rxe_pool_elem *elem)
|
||||
{
|
||||
void *xa_ret;
|
||||
|
||||
xa_ret = xa_store(&elem->pool->xa, elem->index, elem, GFP_KERNEL);
|
||||
WARN_ON(xa_err(xa_ret));
|
||||
}
|
||||
|
||||
@@ -24,6 +24,7 @@ struct rxe_pool_elem {
|
||||
void *obj;
|
||||
struct kref ref_cnt;
|
||||
struct list_head list;
|
||||
struct completion complete;
|
||||
u32 index;
|
||||
};
|
||||
|
||||
@@ -57,21 +58,28 @@ void rxe_pool_cleanup(struct rxe_pool *pool);
|
||||
void *rxe_alloc(struct rxe_pool *pool);
|
||||
|
||||
/* connect already allocated object to pool */
|
||||
int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_elem *elem);
|
||||
|
||||
#define rxe_add_to_pool(pool, obj) __rxe_add_to_pool(pool, &(obj)->elem)
|
||||
int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_elem *elem,
|
||||
bool sleepable);
|
||||
#define rxe_add_to_pool(pool, obj) __rxe_add_to_pool(pool, &(obj)->elem, true)
|
||||
#define rxe_add_to_pool_ah(pool, obj, sleepable) __rxe_add_to_pool(pool, \
|
||||
&(obj)->elem, sleepable)
|
||||
|
||||
/* lookup an indexed object from index. takes a reference on object */
|
||||
void *rxe_pool_get_index(struct rxe_pool *pool, u32 index);
|
||||
|
||||
int __rxe_get(struct rxe_pool_elem *elem);
|
||||
|
||||
#define rxe_get(obj) __rxe_get(&(obj)->elem)
|
||||
|
||||
int __rxe_put(struct rxe_pool_elem *elem);
|
||||
|
||||
#define rxe_put(obj) __rxe_put(&(obj)->elem)
|
||||
|
||||
int __rxe_cleanup(struct rxe_pool_elem *elem, bool sleepable);
|
||||
#define rxe_cleanup(obj) __rxe_cleanup(&(obj)->elem, true)
|
||||
#define rxe_cleanup_ah(obj, sleepable) __rxe_cleanup(&(obj)->elem, sleepable)
|
||||
|
||||
#define rxe_read(obj) kref_read(&(obj)->elem.ref_cnt)
|
||||
|
||||
void __rxe_finalize(struct rxe_pool_elem *elem);
|
||||
#define rxe_finalize(obj) __rxe_finalize(&(obj)->elem)
|
||||
|
||||
#endif /* RXE_POOL_H */
|
||||
|
||||
@@ -120,17 +120,15 @@ static void free_rd_atomic_resources(struct rxe_qp *qp)
|
||||
for (i = 0; i < qp->attr.max_dest_rd_atomic; i++) {
|
||||
struct resp_res *res = &qp->resp.resources[i];
|
||||
|
||||
free_rd_atomic_resource(qp, res);
|
||||
free_rd_atomic_resource(res);
|
||||
}
|
||||
kfree(qp->resp.resources);
|
||||
qp->resp.resources = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
void free_rd_atomic_resource(struct rxe_qp *qp, struct resp_res *res)
|
||||
void free_rd_atomic_resource(struct resp_res *res)
|
||||
{
|
||||
if (res->type == RXE_ATOMIC_MASK)
|
||||
kfree_skb(res->atomic.skb);
|
||||
res->type = 0;
|
||||
}
|
||||
|
||||
@@ -142,7 +140,7 @@ static void cleanup_rd_atomic_resources(struct rxe_qp *qp)
|
||||
if (qp->resp.resources) {
|
||||
for (i = 0; i < qp->attr.max_dest_rd_atomic; i++) {
|
||||
res = &qp->resp.resources[i];
|
||||
free_rd_atomic_resource(qp, res);
|
||||
free_rd_atomic_resource(res);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -174,6 +172,14 @@ static void rxe_qp_init_misc(struct rxe_dev *rxe, struct rxe_qp *qp,
|
||||
|
||||
spin_lock_init(&qp->state_lock);
|
||||
|
||||
spin_lock_init(&qp->req.task.state_lock);
|
||||
spin_lock_init(&qp->resp.task.state_lock);
|
||||
spin_lock_init(&qp->comp.task.state_lock);
|
||||
|
||||
spin_lock_init(&qp->sq.sq_lock);
|
||||
spin_lock_init(&qp->rq.producer_lock);
|
||||
spin_lock_init(&qp->rq.consumer_lock);
|
||||
|
||||
atomic_set(&qp->ssn, 0);
|
||||
atomic_set(&qp->skb_out, 0);
|
||||
}
|
||||
@@ -230,10 +236,10 @@ static int rxe_qp_init_req(struct rxe_dev *rxe, struct rxe_qp *qp,
|
||||
QUEUE_TYPE_FROM_CLIENT);
|
||||
|
||||
qp->req.state = QP_STATE_RESET;
|
||||
qp->comp.state = QP_STATE_RESET;
|
||||
qp->req.opcode = -1;
|
||||
qp->comp.opcode = -1;
|
||||
|
||||
spin_lock_init(&qp->sq.sq_lock);
|
||||
skb_queue_head_init(&qp->req_pkts);
|
||||
|
||||
rxe_init_task(rxe, &qp->req.task, qp,
|
||||
@@ -284,9 +290,6 @@ static int rxe_qp_init_resp(struct rxe_dev *rxe, struct rxe_qp *qp,
|
||||
}
|
||||
}
|
||||
|
||||
spin_lock_init(&qp->rq.producer_lock);
|
||||
spin_lock_init(&qp->rq.consumer_lock);
|
||||
|
||||
skb_queue_head_init(&qp->resp_pkts);
|
||||
|
||||
rxe_init_task(rxe, &qp->resp.task, qp,
|
||||
@@ -490,6 +493,7 @@ static void rxe_qp_reset(struct rxe_qp *qp)
|
||||
|
||||
/* move qp to the reset state */
|
||||
qp->req.state = QP_STATE_RESET;
|
||||
qp->comp.state = QP_STATE_RESET;
|
||||
qp->resp.state = QP_STATE_RESET;
|
||||
|
||||
/* let state machines reset themselves drain work and packet queues
|
||||
@@ -507,6 +511,7 @@ static void rxe_qp_reset(struct rxe_qp *qp)
|
||||
atomic_set(&qp->ssn, 0);
|
||||
qp->req.opcode = -1;
|
||||
qp->req.need_retry = 0;
|
||||
qp->req.wait_for_rnr_timer = 0;
|
||||
qp->req.noack_pkts = 0;
|
||||
qp->resp.msn = 0;
|
||||
qp->resp.opcode = -1;
|
||||
@@ -552,6 +557,7 @@ void rxe_qp_error(struct rxe_qp *qp)
|
||||
{
|
||||
qp->req.state = QP_STATE_ERROR;
|
||||
qp->resp.state = QP_STATE_ERROR;
|
||||
qp->comp.state = QP_STATE_ERROR;
|
||||
qp->attr.qp_state = IB_QPS_ERR;
|
||||
|
||||
/* drain work and packet queues */
|
||||
@@ -689,6 +695,7 @@ int rxe_qp_from_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask,
|
||||
pr_debug("qp#%d state -> INIT\n", qp_num(qp));
|
||||
qp->req.state = QP_STATE_INIT;
|
||||
qp->resp.state = QP_STATE_INIT;
|
||||
qp->comp.state = QP_STATE_INIT;
|
||||
break;
|
||||
|
||||
case IB_QPS_RTR:
|
||||
@@ -699,6 +706,7 @@ int rxe_qp_from_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask,
|
||||
case IB_QPS_RTS:
|
||||
pr_debug("qp#%d state -> RTS\n", qp_num(qp));
|
||||
qp->req.state = QP_STATE_READY;
|
||||
qp->comp.state = QP_STATE_READY;
|
||||
break;
|
||||
|
||||
case IB_QPS_SQD:
|
||||
@@ -804,13 +812,15 @@ static void rxe_qp_do_cleanup(struct work_struct *work)
|
||||
if (qp->rq.queue)
|
||||
rxe_queue_cleanup(qp->rq.queue);
|
||||
|
||||
atomic_dec(&qp->scq->num_wq);
|
||||
if (qp->scq)
|
||||
if (qp->scq) {
|
||||
atomic_dec(&qp->scq->num_wq);
|
||||
rxe_put(qp->scq);
|
||||
}
|
||||
|
||||
atomic_dec(&qp->rcq->num_wq);
|
||||
if (qp->rcq)
|
||||
if (qp->rcq) {
|
||||
atomic_dec(&qp->rcq->num_wq);
|
||||
rxe_put(qp->rcq);
|
||||
}
|
||||
|
||||
if (qp->pd)
|
||||
rxe_put(qp->pd);
|
||||
|
||||
@@ -7,9 +7,6 @@
|
||||
#ifndef RXE_QUEUE_H
|
||||
#define RXE_QUEUE_H
|
||||
|
||||
/* for definition of shared struct rxe_queue_buf */
|
||||
#include <uapi/rdma/rdma_user_rxe.h>
|
||||
|
||||
/* Implements a simple circular buffer that is shared between user
|
||||
* and the driver and can be resized. The requested element size is
|
||||
* rounded up to a power of 2 and the number of elements in the buffer
|
||||
@@ -53,6 +50,8 @@ enum queue_type {
|
||||
QUEUE_TYPE_FROM_DRIVER,
|
||||
};
|
||||
|
||||
struct rxe_queue_buf;
|
||||
|
||||
struct rxe_queue {
|
||||
struct rxe_dev *rxe;
|
||||
struct rxe_queue_buf *buf;
|
||||
|
||||
@@ -15,8 +15,7 @@ static int next_opcode(struct rxe_qp *qp, struct rxe_send_wqe *wqe,
|
||||
u32 opcode);
|
||||
|
||||
static inline void retry_first_write_send(struct rxe_qp *qp,
|
||||
struct rxe_send_wqe *wqe,
|
||||
unsigned int mask, int npsn)
|
||||
struct rxe_send_wqe *wqe, int npsn)
|
||||
{
|
||||
int i;
|
||||
|
||||
@@ -83,7 +82,7 @@ static void req_retry(struct rxe_qp *qp)
|
||||
if (mask & WR_WRITE_OR_SEND_MASK) {
|
||||
npsn = (qp->comp.psn - wqe->first_psn) &
|
||||
BTH_PSN_MASK;
|
||||
retry_first_write_send(qp, wqe, mask, npsn);
|
||||
retry_first_write_send(qp, wqe, npsn);
|
||||
}
|
||||
|
||||
if (mask & WR_READ_MASK) {
|
||||
@@ -101,7 +100,11 @@ void rnr_nak_timer(struct timer_list *t)
|
||||
{
|
||||
struct rxe_qp *qp = from_timer(qp, t, rnr_nak_timer);
|
||||
|
||||
pr_debug("qp#%d rnr nak timer fired\n", qp_num(qp));
|
||||
pr_debug("%s: fired for qp#%d\n", __func__, qp_num(qp));
|
||||
|
||||
/* request a send queue retry */
|
||||
qp->req.need_retry = 1;
|
||||
qp->req.wait_for_rnr_timer = 0;
|
||||
rxe_run_task(&qp->req.task, 1);
|
||||
}
|
||||
|
||||
@@ -161,16 +164,36 @@ static struct rxe_send_wqe *req_next_wqe(struct rxe_qp *qp)
|
||||
(wqe->state != wqe_state_processing)))
|
||||
return NULL;
|
||||
|
||||
if (unlikely((wqe->wr.send_flags & IB_SEND_FENCE) &&
|
||||
(index != cons))) {
|
||||
qp->req.wait_fence = 1;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
wqe->mask = wr_opcode_mask(wqe->wr.opcode, qp);
|
||||
return wqe;
|
||||
}
|
||||
|
||||
/**
|
||||
* rxe_wqe_is_fenced - check if next wqe is fenced
|
||||
* @qp: the queue pair
|
||||
* @wqe: the next wqe
|
||||
*
|
||||
* Returns: 1 if wqe needs to wait
|
||||
* 0 if wqe is ready to go
|
||||
*/
|
||||
static int rxe_wqe_is_fenced(struct rxe_qp *qp, struct rxe_send_wqe *wqe)
|
||||
{
|
||||
/* Local invalidate fence (LIF) see IBA 10.6.5.1
|
||||
* Requires ALL previous operations on the send queue
|
||||
* are complete. Make mandatory for the rxe driver.
|
||||
*/
|
||||
if (wqe->wr.opcode == IB_WR_LOCAL_INV)
|
||||
return qp->req.wqe_index != queue_get_consumer(qp->sq.queue,
|
||||
QUEUE_TYPE_FROM_CLIENT);
|
||||
|
||||
/* Fence see IBA 10.8.3.3
|
||||
* Requires that all previous read and atomic operations
|
||||
* are complete.
|
||||
*/
|
||||
return (wqe->wr.send_flags & IB_SEND_FENCE) &&
|
||||
atomic_read(&qp->req.rd_atomic) != qp->attr.max_rd_atomic;
|
||||
}
|
||||
|
||||
static int next_opcode_rc(struct rxe_qp *qp, u32 opcode, int fits)
|
||||
{
|
||||
switch (opcode) {
|
||||
@@ -581,9 +604,11 @@ static int rxe_do_local_ops(struct rxe_qp *qp, struct rxe_send_wqe *wqe)
|
||||
wqe->status = IB_WC_SUCCESS;
|
||||
qp->req.wqe_index = queue_next_index(qp->sq.queue, qp->req.wqe_index);
|
||||
|
||||
if ((wqe->wr.send_flags & IB_SEND_SIGNALED) ||
|
||||
qp->sq_sig_type == IB_SIGNAL_ALL_WR)
|
||||
rxe_run_task(&qp->comp.task, 1);
|
||||
/* There is no ack coming for local work requests
|
||||
* which can lead to a deadlock. So go ahead and complete
|
||||
* it now.
|
||||
*/
|
||||
rxe_run_task(&qp->comp.task, 1);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -599,6 +624,7 @@ int rxe_requester(void *arg)
|
||||
u32 payload;
|
||||
int mtu;
|
||||
int opcode;
|
||||
int err;
|
||||
int ret;
|
||||
struct rxe_send_wqe rollback_wqe;
|
||||
u32 rollback_psn;
|
||||
@@ -609,10 +635,20 @@ int rxe_requester(void *arg)
|
||||
if (!rxe_get(qp))
|
||||
return -EAGAIN;
|
||||
|
||||
next_wqe:
|
||||
if (unlikely(!qp->valid || qp->req.state == QP_STATE_ERROR))
|
||||
if (unlikely(!qp->valid))
|
||||
goto exit;
|
||||
|
||||
if (unlikely(qp->req.state == QP_STATE_ERROR)) {
|
||||
wqe = req_next_wqe(qp);
|
||||
if (wqe)
|
||||
/*
|
||||
* Generate an error completion for error qp state
|
||||
*/
|
||||
goto err;
|
||||
else
|
||||
goto exit;
|
||||
}
|
||||
|
||||
if (unlikely(qp->req.state == QP_STATE_RESET)) {
|
||||
qp->req.wqe_index = queue_get_consumer(q,
|
||||
QUEUE_TYPE_FROM_CLIENT);
|
||||
@@ -620,10 +656,17 @@ next_wqe:
|
||||
qp->req.need_rd_atomic = 0;
|
||||
qp->req.wait_psn = 0;
|
||||
qp->req.need_retry = 0;
|
||||
qp->req.wait_for_rnr_timer = 0;
|
||||
goto exit;
|
||||
}
|
||||
|
||||
if (unlikely(qp->req.need_retry)) {
|
||||
/* we come here if the retransmit timer has fired
|
||||
* or if the rnr timer has fired. If the retransmit
|
||||
* timer fires while we are processing an RNR NAK wait
|
||||
* until the rnr timer has fired before starting the
|
||||
* retry flow
|
||||
*/
|
||||
if (unlikely(qp->req.need_retry && !qp->req.wait_for_rnr_timer)) {
|
||||
req_retry(qp);
|
||||
qp->req.need_retry = 0;
|
||||
}
|
||||
@@ -632,12 +675,17 @@ next_wqe:
|
||||
if (unlikely(!wqe))
|
||||
goto exit;
|
||||
|
||||
if (rxe_wqe_is_fenced(qp, wqe)) {
|
||||
qp->req.wait_fence = 1;
|
||||
goto exit;
|
||||
}
|
||||
|
||||
if (wqe->mask & WR_LOCAL_OP_MASK) {
|
||||
ret = rxe_do_local_ops(qp, wqe);
|
||||
if (unlikely(ret))
|
||||
err = rxe_do_local_ops(qp, wqe);
|
||||
if (unlikely(err))
|
||||
goto err;
|
||||
else
|
||||
goto next_wqe;
|
||||
goto done;
|
||||
}
|
||||
|
||||
if (unlikely(qp_type(qp) == IB_QPT_RC &&
|
||||
@@ -685,9 +733,8 @@ next_wqe:
|
||||
qp->req.wqe_index);
|
||||
wqe->state = wqe_state_done;
|
||||
wqe->status = IB_WC_SUCCESS;
|
||||
__rxe_do_task(&qp->comp.task);
|
||||
rxe_put(qp);
|
||||
return 0;
|
||||
rxe_run_task(&qp->comp.task, 0);
|
||||
goto done;
|
||||
}
|
||||
payload = mtu;
|
||||
}
|
||||
@@ -703,25 +750,29 @@ next_wqe:
|
||||
if (unlikely(!av)) {
|
||||
pr_err("qp#%d Failed no address vector\n", qp_num(qp));
|
||||
wqe->status = IB_WC_LOC_QP_OP_ERR;
|
||||
goto err_drop_ah;
|
||||
goto err;
|
||||
}
|
||||
|
||||
skb = init_req_packet(qp, av, wqe, opcode, payload, &pkt);
|
||||
if (unlikely(!skb)) {
|
||||
pr_err("qp#%d Failed allocating skb\n", qp_num(qp));
|
||||
wqe->status = IB_WC_LOC_QP_OP_ERR;
|
||||
goto err_drop_ah;
|
||||
if (ah)
|
||||
rxe_put(ah);
|
||||
goto err;
|
||||
}
|
||||
|
||||
ret = finish_packet(qp, av, wqe, &pkt, skb, payload);
|
||||
if (unlikely(ret)) {
|
||||
err = finish_packet(qp, av, wqe, &pkt, skb, payload);
|
||||
if (unlikely(err)) {
|
||||
pr_debug("qp#%d Error during finish packet\n", qp_num(qp));
|
||||
if (ret == -EFAULT)
|
||||
if (err == -EFAULT)
|
||||
wqe->status = IB_WC_LOC_PROT_ERR;
|
||||
else
|
||||
wqe->status = IB_WC_LOC_QP_OP_ERR;
|
||||
kfree_skb(skb);
|
||||
goto err_drop_ah;
|
||||
if (ah)
|
||||
rxe_put(ah);
|
||||
goto err;
|
||||
}
|
||||
|
||||
if (ah)
|
||||
@@ -736,13 +787,14 @@ next_wqe:
|
||||
save_state(wqe, qp, &rollback_wqe, &rollback_psn);
|
||||
update_wqe_state(qp, wqe, &pkt);
|
||||
update_wqe_psn(qp, wqe, &pkt, payload);
|
||||
ret = rxe_xmit_packet(qp, &pkt, skb);
|
||||
if (ret) {
|
||||
|
||||
err = rxe_xmit_packet(qp, &pkt, skb);
|
||||
if (err) {
|
||||
qp->need_req_skb = 1;
|
||||
|
||||
rollback_state(wqe, qp, &rollback_wqe, rollback_psn);
|
||||
|
||||
if (ret == -EAGAIN) {
|
||||
if (err == -EAGAIN) {
|
||||
rxe_run_task(&qp->req.task, 1);
|
||||
goto exit;
|
||||
}
|
||||
@@ -753,16 +805,23 @@ next_wqe:
|
||||
|
||||
update_state(qp, &pkt);
|
||||
|
||||
goto next_wqe;
|
||||
|
||||
err_drop_ah:
|
||||
if (ah)
|
||||
rxe_put(ah);
|
||||
/* A non-zero return value will cause rxe_do_task to
|
||||
* exit its loop and end the tasklet. A zero return
|
||||
* will continue looping and return to rxe_requester
|
||||
*/
|
||||
done:
|
||||
ret = 0;
|
||||
goto out;
|
||||
err:
|
||||
/* update wqe_index for each wqe completion */
|
||||
qp->req.wqe_index = queue_next_index(qp->sq.queue, qp->req.wqe_index);
|
||||
wqe->state = wqe_state_error;
|
||||
__rxe_do_task(&qp->comp.task);
|
||||
|
||||
qp->req.state = QP_STATE_ERROR;
|
||||
rxe_run_task(&qp->comp.task, 0);
|
||||
exit:
|
||||
ret = -EAGAIN;
|
||||
out:
|
||||
rxe_put(qp);
|
||||
return -EAGAIN;
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -21,6 +21,7 @@ enum resp_states {
|
||||
RESPST_CHK_RKEY,
|
||||
RESPST_EXECUTE,
|
||||
RESPST_READ_REPLY,
|
||||
RESPST_ATOMIC_REPLY,
|
||||
RESPST_COMPLETE,
|
||||
RESPST_ACKNOWLEDGE,
|
||||
RESPST_CLEANUP,
|
||||
@@ -55,6 +56,7 @@ static char *resp_state_name[] = {
|
||||
[RESPST_CHK_RKEY] = "CHK_RKEY",
|
||||
[RESPST_EXECUTE] = "EXECUTE",
|
||||
[RESPST_READ_REPLY] = "READ_REPLY",
|
||||
[RESPST_ATOMIC_REPLY] = "ATOMIC_REPLY",
|
||||
[RESPST_COMPLETE] = "COMPLETE",
|
||||
[RESPST_ACKNOWLEDGE] = "ACKNOWLEDGE",
|
||||
[RESPST_CLEANUP] = "CLEANUP",
|
||||
@@ -448,7 +450,8 @@ static enum resp_states check_rkey(struct rxe_qp *qp,
|
||||
if (rkey_is_mw(rkey)) {
|
||||
mw = rxe_lookup_mw(qp, access, rkey);
|
||||
if (!mw) {
|
||||
pr_err("%s: no MW matches rkey %#x\n", __func__, rkey);
|
||||
pr_debug("%s: no MW matches rkey %#x\n",
|
||||
__func__, rkey);
|
||||
state = RESPST_ERR_RKEY_VIOLATION;
|
||||
goto err;
|
||||
}
|
||||
@@ -468,7 +471,8 @@ static enum resp_states check_rkey(struct rxe_qp *qp,
|
||||
} else {
|
||||
mr = lookup_mr(qp->pd, access, rkey, RXE_LOOKUP_REMOTE);
|
||||
if (!mr) {
|
||||
pr_err("%s: no MR matches rkey %#x\n", __func__, rkey);
|
||||
pr_debug("%s: no MR matches rkey %#x\n",
|
||||
__func__, rkey);
|
||||
state = RESPST_ERR_RKEY_VIOLATION;
|
||||
goto err;
|
||||
}
|
||||
@@ -549,49 +553,106 @@ out:
|
||||
return rc;
|
||||
}
|
||||
|
||||
static struct resp_res *rxe_prepare_res(struct rxe_qp *qp,
|
||||
struct rxe_pkt_info *pkt,
|
||||
int type)
|
||||
{
|
||||
struct resp_res *res;
|
||||
u32 pkts;
|
||||
|
||||
res = &qp->resp.resources[qp->resp.res_head];
|
||||
rxe_advance_resp_resource(qp);
|
||||
free_rd_atomic_resource(res);
|
||||
|
||||
res->type = type;
|
||||
res->replay = 0;
|
||||
|
||||
switch (type) {
|
||||
case RXE_READ_MASK:
|
||||
res->read.va = qp->resp.va + qp->resp.offset;
|
||||
res->read.va_org = qp->resp.va + qp->resp.offset;
|
||||
res->read.resid = qp->resp.resid;
|
||||
res->read.length = qp->resp.resid;
|
||||
res->read.rkey = qp->resp.rkey;
|
||||
|
||||
pkts = max_t(u32, (reth_len(pkt) + qp->mtu - 1)/qp->mtu, 1);
|
||||
res->first_psn = pkt->psn;
|
||||
res->cur_psn = pkt->psn;
|
||||
res->last_psn = (pkt->psn + pkts - 1) & BTH_PSN_MASK;
|
||||
|
||||
res->state = rdatm_res_state_new;
|
||||
break;
|
||||
case RXE_ATOMIC_MASK:
|
||||
res->first_psn = pkt->psn;
|
||||
res->last_psn = pkt->psn;
|
||||
res->cur_psn = pkt->psn;
|
||||
break;
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
/* Guarantee atomicity of atomic operations at the machine level. */
|
||||
static DEFINE_SPINLOCK(atomic_ops_lock);
|
||||
|
||||
static enum resp_states process_atomic(struct rxe_qp *qp,
|
||||
struct rxe_pkt_info *pkt)
|
||||
static enum resp_states atomic_reply(struct rxe_qp *qp,
|
||||
struct rxe_pkt_info *pkt)
|
||||
{
|
||||
u64 *vaddr;
|
||||
enum resp_states ret;
|
||||
struct rxe_mr *mr = qp->resp.mr;
|
||||
struct resp_res *res = qp->resp.res;
|
||||
u64 value;
|
||||
|
||||
if (mr->state != RXE_MR_STATE_VALID) {
|
||||
ret = RESPST_ERR_RKEY_VIOLATION;
|
||||
goto out;
|
||||
if (!res) {
|
||||
res = rxe_prepare_res(qp, pkt, RXE_ATOMIC_MASK);
|
||||
qp->resp.res = res;
|
||||
}
|
||||
|
||||
vaddr = iova_to_vaddr(mr, qp->resp.va + qp->resp.offset, sizeof(u64));
|
||||
if (!res->replay) {
|
||||
if (mr->state != RXE_MR_STATE_VALID) {
|
||||
ret = RESPST_ERR_RKEY_VIOLATION;
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* check vaddr is 8 bytes aligned. */
|
||||
if (!vaddr || (uintptr_t)vaddr & 7) {
|
||||
ret = RESPST_ERR_MISALIGNED_ATOMIC;
|
||||
goto out;
|
||||
vaddr = iova_to_vaddr(mr, qp->resp.va + qp->resp.offset,
|
||||
sizeof(u64));
|
||||
|
||||
/* check vaddr is 8 bytes aligned. */
|
||||
if (!vaddr || (uintptr_t)vaddr & 7) {
|
||||
ret = RESPST_ERR_MISALIGNED_ATOMIC;
|
||||
goto out;
|
||||
}
|
||||
|
||||
spin_lock_bh(&atomic_ops_lock);
|
||||
res->atomic.orig_val = value = *vaddr;
|
||||
|
||||
if (pkt->opcode == IB_OPCODE_RC_COMPARE_SWAP) {
|
||||
if (value == atmeth_comp(pkt))
|
||||
value = atmeth_swap_add(pkt);
|
||||
} else {
|
||||
value += atmeth_swap_add(pkt);
|
||||
}
|
||||
|
||||
*vaddr = value;
|
||||
spin_unlock_bh(&atomic_ops_lock);
|
||||
|
||||
qp->resp.msn++;
|
||||
|
||||
/* next expected psn, read handles this separately */
|
||||
qp->resp.psn = (pkt->psn + 1) & BTH_PSN_MASK;
|
||||
qp->resp.ack_psn = qp->resp.psn;
|
||||
|
||||
qp->resp.opcode = pkt->opcode;
|
||||
qp->resp.status = IB_WC_SUCCESS;
|
||||
}
|
||||
|
||||
spin_lock_bh(&atomic_ops_lock);
|
||||
|
||||
qp->resp.atomic_orig = *vaddr;
|
||||
|
||||
if (pkt->opcode == IB_OPCODE_RC_COMPARE_SWAP) {
|
||||
if (*vaddr == atmeth_comp(pkt))
|
||||
*vaddr = atmeth_swap_add(pkt);
|
||||
} else {
|
||||
*vaddr += atmeth_swap_add(pkt);
|
||||
}
|
||||
|
||||
spin_unlock_bh(&atomic_ops_lock);
|
||||
|
||||
ret = RESPST_NONE;
|
||||
ret = RESPST_ACKNOWLEDGE;
|
||||
out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static struct sk_buff *prepare_ack_packet(struct rxe_qp *qp,
|
||||
struct rxe_pkt_info *pkt,
|
||||
struct rxe_pkt_info *ack,
|
||||
int opcode,
|
||||
int payload,
|
||||
@@ -629,7 +690,7 @@ static struct sk_buff *prepare_ack_packet(struct rxe_qp *qp,
|
||||
}
|
||||
|
||||
if (ack->mask & RXE_ATMACK_MASK)
|
||||
atmack_set_orig(ack, qp->resp.atomic_orig);
|
||||
atmack_set_orig(ack, qp->resp.res->atomic.orig_val);
|
||||
|
||||
err = rxe_prepare(&qp->pri_av, ack, skb);
|
||||
if (err) {
|
||||
@@ -640,34 +701,6 @@ static struct sk_buff *prepare_ack_packet(struct rxe_qp *qp,
|
||||
return skb;
|
||||
}
|
||||
|
||||
static struct resp_res *rxe_prepare_read_res(struct rxe_qp *qp,
|
||||
struct rxe_pkt_info *pkt)
|
||||
{
|
||||
struct resp_res *res;
|
||||
u32 pkts;
|
||||
|
||||
res = &qp->resp.resources[qp->resp.res_head];
|
||||
rxe_advance_resp_resource(qp);
|
||||
free_rd_atomic_resource(qp, res);
|
||||
|
||||
res->type = RXE_READ_MASK;
|
||||
res->replay = 0;
|
||||
res->read.va = qp->resp.va + qp->resp.offset;
|
||||
res->read.va_org = qp->resp.va + qp->resp.offset;
|
||||
res->read.resid = qp->resp.resid;
|
||||
res->read.length = qp->resp.resid;
|
||||
res->read.rkey = qp->resp.rkey;
|
||||
|
||||
pkts = max_t(u32, (reth_len(pkt) + qp->mtu - 1)/qp->mtu, 1);
|
||||
res->first_psn = pkt->psn;
|
||||
res->cur_psn = pkt->psn;
|
||||
res->last_psn = (pkt->psn + pkts - 1) & BTH_PSN_MASK;
|
||||
|
||||
res->state = rdatm_res_state_new;
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
/**
|
||||
* rxe_recheck_mr - revalidate MR from rkey and get a reference
|
||||
* @qp: the qp
|
||||
@@ -738,7 +771,7 @@ static enum resp_states read_reply(struct rxe_qp *qp,
|
||||
struct rxe_mr *mr;
|
||||
|
||||
if (!res) {
|
||||
res = rxe_prepare_read_res(qp, req_pkt);
|
||||
res = rxe_prepare_res(qp, req_pkt, RXE_READ_MASK);
|
||||
qp->resp.res = res;
|
||||
}
|
||||
|
||||
@@ -771,7 +804,7 @@ static enum resp_states read_reply(struct rxe_qp *qp,
|
||||
|
||||
payload = min_t(int, res->read.resid, mtu);
|
||||
|
||||
skb = prepare_ack_packet(qp, req_pkt, &ack_pkt, opcode, payload,
|
||||
skb = prepare_ack_packet(qp, &ack_pkt, opcode, payload,
|
||||
res->cur_psn, AETH_ACK_UNLIMITED);
|
||||
if (!skb)
|
||||
return RESPST_ERR_RNR;
|
||||
@@ -858,9 +891,7 @@ static enum resp_states execute(struct rxe_qp *qp, struct rxe_pkt_info *pkt)
|
||||
qp->resp.msn++;
|
||||
return RESPST_READ_REPLY;
|
||||
} else if (pkt->mask & RXE_ATOMIC_MASK) {
|
||||
err = process_atomic(qp, pkt);
|
||||
if (err)
|
||||
return err;
|
||||
return RESPST_ATOMIC_REPLY;
|
||||
} else {
|
||||
/* Unreachable */
|
||||
WARN_ON_ONCE(1);
|
||||
@@ -997,14 +1028,13 @@ finish:
|
||||
return RESPST_CLEANUP;
|
||||
}
|
||||
|
||||
static int send_ack(struct rxe_qp *qp, struct rxe_pkt_info *pkt,
|
||||
u8 syndrome, u32 psn)
|
||||
static int send_ack(struct rxe_qp *qp, u8 syndrome, u32 psn)
|
||||
{
|
||||
int err = 0;
|
||||
struct rxe_pkt_info ack_pkt;
|
||||
struct sk_buff *skb;
|
||||
|
||||
skb = prepare_ack_packet(qp, pkt, &ack_pkt, IB_OPCODE_RC_ACKNOWLEDGE,
|
||||
skb = prepare_ack_packet(qp, &ack_pkt, IB_OPCODE_RC_ACKNOWLEDGE,
|
||||
0, psn, syndrome);
|
||||
if (!skb) {
|
||||
err = -ENOMEM;
|
||||
@@ -1019,40 +1049,29 @@ err1:
|
||||
return err;
|
||||
}
|
||||
|
||||
static int send_atomic_ack(struct rxe_qp *qp, struct rxe_pkt_info *pkt,
|
||||
u8 syndrome)
|
||||
static int send_atomic_ack(struct rxe_qp *qp, u8 syndrome, u32 psn)
|
||||
{
|
||||
int rc = 0;
|
||||
int err = 0;
|
||||
struct rxe_pkt_info ack_pkt;
|
||||
struct sk_buff *skb;
|
||||
struct resp_res *res;
|
||||
|
||||
skb = prepare_ack_packet(qp, pkt, &ack_pkt,
|
||||
IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE, 0, pkt->psn,
|
||||
syndrome);
|
||||
skb = prepare_ack_packet(qp, &ack_pkt, IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE,
|
||||
0, psn, syndrome);
|
||||
if (!skb) {
|
||||
rc = -ENOMEM;
|
||||
err = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
res = &qp->resp.resources[qp->resp.res_head];
|
||||
free_rd_atomic_resource(qp, res);
|
||||
rxe_advance_resp_resource(qp);
|
||||
err = rxe_xmit_packet(qp, &ack_pkt, skb);
|
||||
if (err)
|
||||
pr_err_ratelimited("Failed sending atomic ack\n");
|
||||
|
||||
skb_get(skb);
|
||||
res->type = RXE_ATOMIC_MASK;
|
||||
res->atomic.skb = skb;
|
||||
res->first_psn = ack_pkt.psn;
|
||||
res->last_psn = ack_pkt.psn;
|
||||
res->cur_psn = ack_pkt.psn;
|
||||
|
||||
rc = rxe_xmit_packet(qp, &ack_pkt, skb);
|
||||
if (rc) {
|
||||
pr_err_ratelimited("Failed sending ack\n");
|
||||
rxe_put(qp);
|
||||
}
|
||||
/* have to clear this since it is used to trigger
|
||||
* long read replies
|
||||
*/
|
||||
qp->resp.res = NULL;
|
||||
out:
|
||||
return rc;
|
||||
return err;
|
||||
}
|
||||
|
||||
static enum resp_states acknowledge(struct rxe_qp *qp,
|
||||
@@ -1062,11 +1081,11 @@ static enum resp_states acknowledge(struct rxe_qp *qp,
|
||||
return RESPST_CLEANUP;
|
||||
|
||||
if (qp->resp.aeth_syndrome != AETH_ACK_UNLIMITED)
|
||||
send_ack(qp, pkt, qp->resp.aeth_syndrome, pkt->psn);
|
||||
send_ack(qp, qp->resp.aeth_syndrome, pkt->psn);
|
||||
else if (pkt->mask & RXE_ATOMIC_MASK)
|
||||
send_atomic_ack(qp, pkt, AETH_ACK_UNLIMITED);
|
||||
send_atomic_ack(qp, AETH_ACK_UNLIMITED, pkt->psn);
|
||||
else if (bth_ack(pkt))
|
||||
send_ack(qp, pkt, AETH_ACK_UNLIMITED, pkt->psn);
|
||||
send_ack(qp, AETH_ACK_UNLIMITED, pkt->psn);
|
||||
|
||||
return RESPST_CLEANUP;
|
||||
}
|
||||
@@ -1119,7 +1138,7 @@ static enum resp_states duplicate_request(struct rxe_qp *qp,
|
||||
if (pkt->mask & RXE_SEND_MASK ||
|
||||
pkt->mask & RXE_WRITE_MASK) {
|
||||
/* SEND. Ack again and cleanup. C9-105. */
|
||||
send_ack(qp, pkt, AETH_ACK_UNLIMITED, prev_psn);
|
||||
send_ack(qp, AETH_ACK_UNLIMITED, prev_psn);
|
||||
return RESPST_CLEANUP;
|
||||
} else if (pkt->mask & RXE_READ_MASK) {
|
||||
struct resp_res *res;
|
||||
@@ -1173,14 +1192,11 @@ static enum resp_states duplicate_request(struct rxe_qp *qp,
|
||||
/* Find the operation in our list of responder resources. */
|
||||
res = find_resource(qp, pkt->psn);
|
||||
if (res) {
|
||||
skb_get(res->atomic.skb);
|
||||
/* Resend the result. */
|
||||
rc = rxe_xmit_packet(qp, pkt, res->atomic.skb);
|
||||
if (rc) {
|
||||
pr_err("Failed resending result. This flow is not handled - skb ignored\n");
|
||||
rc = RESPST_CLEANUP;
|
||||
goto out;
|
||||
}
|
||||
res->replay = 1;
|
||||
res->cur_psn = pkt->psn;
|
||||
qp->resp.res = res;
|
||||
rc = RESPST_ATOMIC_REPLY;
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* Resource not found. Class D error. Drop the request. */
|
||||
@@ -1260,17 +1276,15 @@ int rxe_responder(void *arg)
|
||||
struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
|
||||
enum resp_states state;
|
||||
struct rxe_pkt_info *pkt = NULL;
|
||||
int ret = 0;
|
||||
int ret;
|
||||
|
||||
if (!rxe_get(qp))
|
||||
return -EAGAIN;
|
||||
|
||||
qp->resp.aeth_syndrome = AETH_ACK_UNLIMITED;
|
||||
|
||||
if (!qp->valid) {
|
||||
ret = -EINVAL;
|
||||
goto done;
|
||||
}
|
||||
if (!qp->valid)
|
||||
goto exit;
|
||||
|
||||
switch (qp->resp.state) {
|
||||
case QP_STATE_RESET:
|
||||
@@ -1316,6 +1330,9 @@ int rxe_responder(void *arg)
|
||||
case RESPST_READ_REPLY:
|
||||
state = read_reply(qp, pkt);
|
||||
break;
|
||||
case RESPST_ATOMIC_REPLY:
|
||||
state = atomic_reply(qp, pkt);
|
||||
break;
|
||||
case RESPST_ACKNOWLEDGE:
|
||||
state = acknowledge(qp, pkt);
|
||||
break;
|
||||
@@ -1327,7 +1344,7 @@ int rxe_responder(void *arg)
|
||||
break;
|
||||
case RESPST_ERR_PSN_OUT_OF_SEQ:
|
||||
/* RC only - Class B. Drop packet. */
|
||||
send_ack(qp, pkt, AETH_NAK_PSN_SEQ_ERROR, qp->resp.psn);
|
||||
send_ack(qp, AETH_NAK_PSN_SEQ_ERROR, qp->resp.psn);
|
||||
state = RESPST_CLEANUP;
|
||||
break;
|
||||
|
||||
@@ -1349,7 +1366,7 @@ int rxe_responder(void *arg)
|
||||
if (qp_type(qp) == IB_QPT_RC) {
|
||||
rxe_counter_inc(rxe, RXE_CNT_SND_RNR);
|
||||
/* RC - class B */
|
||||
send_ack(qp, pkt, AETH_RNR_NAK |
|
||||
send_ack(qp, AETH_RNR_NAK |
|
||||
(~AETH_TYPE_MASK &
|
||||
qp->attr.min_rnr_timer),
|
||||
pkt->psn);
|
||||
@@ -1438,7 +1455,7 @@ int rxe_responder(void *arg)
|
||||
|
||||
case RESPST_ERROR:
|
||||
qp->resp.goto_error = 0;
|
||||
pr_warn("qp#%d moved to error state\n", qp_num(qp));
|
||||
pr_debug("qp#%d moved to error state\n", qp_num(qp));
|
||||
rxe_qp_error(qp);
|
||||
goto exit;
|
||||
|
||||
@@ -1447,9 +1464,16 @@ int rxe_responder(void *arg)
|
||||
}
|
||||
}
|
||||
|
||||
/* A non-zero return value will cause rxe_do_task to
|
||||
* exit its loop and end the tasklet. A zero return
|
||||
* will continue looping and return to rxe_responder
|
||||
*/
|
||||
done:
|
||||
ret = 0;
|
||||
goto out;
|
||||
exit:
|
||||
ret = -EAGAIN;
|
||||
done:
|
||||
out:
|
||||
rxe_put(qp);
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
#include <linux/interrupt.h>
|
||||
#include <linux/hardirq.h>
|
||||
|
||||
#include "rxe_task.h"
|
||||
#include "rxe.h"
|
||||
|
||||
int __rxe_do_task(struct rxe_task *task)
|
||||
|
||||
@@ -33,6 +33,7 @@ void rxe_do_task(struct tasklet_struct *t)
|
||||
int cont;
|
||||
int ret;
|
||||
struct rxe_task *task = from_tasklet(task, t, tasklet);
|
||||
unsigned int iterations = RXE_MAX_ITERATIONS;
|
||||
|
||||
spin_lock_bh(&task->state_lock);
|
||||
switch (task->state) {
|
||||
@@ -61,13 +62,20 @@ void rxe_do_task(struct tasklet_struct *t)
|
||||
spin_lock_bh(&task->state_lock);
|
||||
switch (task->state) {
|
||||
case TASK_STATE_BUSY:
|
||||
if (ret)
|
||||
if (ret) {
|
||||
task->state = TASK_STATE_START;
|
||||
else
|
||||
} else if (iterations--) {
|
||||
cont = 1;
|
||||
} else {
|
||||
/* reschedule the tasklet and exit
|
||||
* the loop to give up the cpu
|
||||
*/
|
||||
tasklet_schedule(&task->tasklet);
|
||||
task->state = TASK_STATE_START;
|
||||
}
|
||||
break;
|
||||
|
||||
/* soneone tried to run the task since the last time we called
|
||||
/* someone tried to run the task since the last time we called
|
||||
* func, so we will call one more time regardless of the
|
||||
* return value
|
||||
*/
|
||||
|
||||
@@ -115,7 +115,7 @@ static void rxe_dealloc_ucontext(struct ib_ucontext *ibuc)
|
||||
{
|
||||
struct rxe_ucontext *uc = to_ruc(ibuc);
|
||||
|
||||
rxe_put(uc);
|
||||
rxe_cleanup(uc);
|
||||
}
|
||||
|
||||
static int rxe_port_immutable(struct ib_device *dev, u32 port_num,
|
||||
@@ -149,7 +149,7 @@ static int rxe_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
|
||||
{
|
||||
struct rxe_pd *pd = to_rpd(ibpd);
|
||||
|
||||
rxe_put(pd);
|
||||
rxe_cleanup(pd);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -176,7 +176,8 @@ static int rxe_create_ah(struct ib_ah *ibah,
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
err = rxe_add_to_pool(&rxe->ah_pool, ah);
|
||||
err = rxe_add_to_pool_ah(&rxe->ah_pool, ah,
|
||||
init_attr->flags & RDMA_CREATE_AH_SLEEPABLE);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
@@ -188,7 +189,7 @@ static int rxe_create_ah(struct ib_ah *ibah,
|
||||
err = copy_to_user(&uresp->ah_num, &ah->ah_num,
|
||||
sizeof(uresp->ah_num));
|
||||
if (err) {
|
||||
rxe_put(ah);
|
||||
rxe_cleanup(ah);
|
||||
return -EFAULT;
|
||||
}
|
||||
} else if (ah->is_user) {
|
||||
@@ -197,6 +198,8 @@ static int rxe_create_ah(struct ib_ah *ibah,
|
||||
}
|
||||
|
||||
rxe_init_av(init_attr->ah_attr, &ah->av);
|
||||
rxe_finalize(ah);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -228,7 +231,8 @@ static int rxe_destroy_ah(struct ib_ah *ibah, u32 flags)
|
||||
{
|
||||
struct rxe_ah *ah = to_rah(ibah);
|
||||
|
||||
rxe_put(ah);
|
||||
rxe_cleanup_ah(ah, flags & RDMA_DESTROY_AH_SLEEPABLE);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -308,12 +312,13 @@ static int rxe_create_srq(struct ib_srq *ibsrq, struct ib_srq_init_attr *init,
|
||||
|
||||
err = rxe_srq_from_init(rxe, srq, init, udata, uresp);
|
||||
if (err)
|
||||
goto err_put;
|
||||
goto err_cleanup;
|
||||
|
||||
return 0;
|
||||
|
||||
err_put:
|
||||
rxe_put(srq);
|
||||
err_cleanup:
|
||||
rxe_cleanup(srq);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
@@ -362,7 +367,7 @@ static int rxe_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata)
|
||||
{
|
||||
struct rxe_srq *srq = to_rsrq(ibsrq);
|
||||
|
||||
rxe_put(srq);
|
||||
rxe_cleanup(srq);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -429,10 +434,11 @@ static int rxe_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *init,
|
||||
if (err)
|
||||
goto qp_init;
|
||||
|
||||
rxe_finalize(qp);
|
||||
return 0;
|
||||
|
||||
qp_init:
|
||||
rxe_put(qp);
|
||||
rxe_cleanup(qp);
|
||||
return err;
|
||||
}
|
||||
|
||||
@@ -485,7 +491,7 @@ static int rxe_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
rxe_put(qp);
|
||||
rxe_cleanup(qp);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -803,7 +809,7 @@ static int rxe_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
|
||||
|
||||
rxe_cq_disable(cq);
|
||||
|
||||
rxe_put(cq);
|
||||
rxe_cleanup(cq);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -898,6 +904,7 @@ static struct ib_mr *rxe_get_dma_mr(struct ib_pd *ibpd, int access)
|
||||
|
||||
rxe_get(pd);
|
||||
rxe_mr_init_dma(pd, access, mr);
|
||||
rxe_finalize(mr);
|
||||
|
||||
return &mr->ibmr;
|
||||
}
|
||||
@@ -926,11 +933,13 @@ static struct ib_mr *rxe_reg_user_mr(struct ib_pd *ibpd,
|
||||
if (err)
|
||||
goto err3;
|
||||
|
||||
rxe_finalize(mr);
|
||||
|
||||
return &mr->ibmr;
|
||||
|
||||
err3:
|
||||
rxe_put(pd);
|
||||
rxe_put(mr);
|
||||
rxe_cleanup(mr);
|
||||
err2:
|
||||
return ERR_PTR(err);
|
||||
}
|
||||
@@ -958,35 +967,52 @@ static struct ib_mr *rxe_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type,
|
||||
if (err)
|
||||
goto err2;
|
||||
|
||||
rxe_finalize(mr);
|
||||
|
||||
return &mr->ibmr;
|
||||
|
||||
err2:
|
||||
rxe_put(pd);
|
||||
rxe_put(mr);
|
||||
rxe_cleanup(mr);
|
||||
err1:
|
||||
return ERR_PTR(err);
|
||||
}
|
||||
|
||||
/* build next_map_set from scatterlist
|
||||
* The IB_WR_REG_MR WR will swap map_sets
|
||||
*/
|
||||
static int rxe_set_page(struct ib_mr *ibmr, u64 addr)
|
||||
{
|
||||
struct rxe_mr *mr = to_rmr(ibmr);
|
||||
struct rxe_map *map;
|
||||
struct rxe_phys_buf *buf;
|
||||
|
||||
if (unlikely(mr->nbuf == mr->num_buf))
|
||||
return -ENOMEM;
|
||||
|
||||
map = mr->map[mr->nbuf / RXE_BUF_PER_MAP];
|
||||
buf = &map->buf[mr->nbuf % RXE_BUF_PER_MAP];
|
||||
|
||||
buf->addr = addr;
|
||||
buf->size = ibmr->page_size;
|
||||
mr->nbuf++;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int rxe_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg,
|
||||
int sg_nents, unsigned int *sg_offset)
|
||||
{
|
||||
struct rxe_mr *mr = to_rmr(ibmr);
|
||||
struct rxe_map_set *set = mr->next_map_set;
|
||||
int n;
|
||||
|
||||
set->nbuf = 0;
|
||||
mr->nbuf = 0;
|
||||
|
||||
n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, rxe_mr_set_page);
|
||||
n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, rxe_set_page);
|
||||
|
||||
set->va = ibmr->iova;
|
||||
set->iova = ibmr->iova;
|
||||
set->length = ibmr->length;
|
||||
set->page_shift = ilog2(ibmr->page_size);
|
||||
set->page_mask = ibmr->page_size - 1;
|
||||
set->offset = set->iova & set->page_mask;
|
||||
mr->va = ibmr->iova;
|
||||
mr->iova = ibmr->iova;
|
||||
mr->length = ibmr->length;
|
||||
mr->page_shift = ilog2(ibmr->page_size);
|
||||
mr->page_mask = ibmr->page_size - 1;
|
||||
mr->offset = mr->iova & mr->page_mask;
|
||||
|
||||
return n;
|
||||
}
|
||||
|
||||
@@ -9,7 +9,6 @@
|
||||
|
||||
#include <linux/interrupt.h>
|
||||
#include <linux/workqueue.h>
|
||||
#include <rdma/rdma_user_rxe.h>
|
||||
#include "rxe_pool.h"
|
||||
#include "rxe_task.h"
|
||||
#include "rxe_hw_counters.h"
|
||||
@@ -124,11 +123,13 @@ struct rxe_req_info {
|
||||
int need_rd_atomic;
|
||||
int wait_psn;
|
||||
int need_retry;
|
||||
int wait_for_rnr_timer;
|
||||
int noack_pkts;
|
||||
struct rxe_task task;
|
||||
};
|
||||
|
||||
struct rxe_comp_info {
|
||||
enum rxe_qp_state state;
|
||||
u32 psn;
|
||||
int opcode;
|
||||
int timeout;
|
||||
@@ -155,7 +156,7 @@ struct resp_res {
|
||||
|
||||
union {
|
||||
struct {
|
||||
struct sk_buff *skb;
|
||||
u64 orig_val;
|
||||
} atomic;
|
||||
struct {
|
||||
u64 va_org;
|
||||
@@ -189,7 +190,6 @@ struct rxe_resp_info {
|
||||
u32 resid;
|
||||
u32 rkey;
|
||||
u32 length;
|
||||
u64 atomic_orig;
|
||||
|
||||
/* SRQ only */
|
||||
struct {
|
||||
@@ -288,17 +288,6 @@ struct rxe_map {
|
||||
struct rxe_phys_buf buf[RXE_BUF_PER_MAP];
|
||||
};
|
||||
|
||||
struct rxe_map_set {
|
||||
struct rxe_map **map;
|
||||
u64 va;
|
||||
u64 iova;
|
||||
size_t length;
|
||||
u32 offset;
|
||||
u32 nbuf;
|
||||
int page_shift;
|
||||
int page_mask;
|
||||
};
|
||||
|
||||
static inline int rkey_is_mw(u32 rkey)
|
||||
{
|
||||
u32 index = rkey >> 8;
|
||||
@@ -316,20 +305,26 @@ struct rxe_mr {
|
||||
u32 rkey;
|
||||
enum rxe_mr_state state;
|
||||
enum ib_mr_type type;
|
||||
u64 va;
|
||||
u64 iova;
|
||||
size_t length;
|
||||
u32 offset;
|
||||
int access;
|
||||
|
||||
int page_shift;
|
||||
int page_mask;
|
||||
int map_shift;
|
||||
int map_mask;
|
||||
|
||||
u32 num_buf;
|
||||
u32 nbuf;
|
||||
|
||||
u32 max_buf;
|
||||
u32 num_map;
|
||||
|
||||
atomic_t num_mw;
|
||||
|
||||
struct rxe_map_set *cur_map_set;
|
||||
struct rxe_map_set *next_map_set;
|
||||
struct rxe_map **map;
|
||||
};
|
||||
|
||||
enum rxe_mw_state {
|
||||
|
||||
@@ -725,11 +725,11 @@ static int siw_proc_mpareply(struct siw_cep *cep)
|
||||
enum mpa_v2_ctrl mpa_p2p_mode = MPA_V2_RDMA_NO_RTR;
|
||||
|
||||
rv = siw_recv_mpa_rr(cep);
|
||||
if (rv != -EAGAIN)
|
||||
siw_cancel_mpatimer(cep);
|
||||
if (rv)
|
||||
goto out_err;
|
||||
|
||||
siw_cancel_mpatimer(cep);
|
||||
|
||||
rep = &cep->mpa.hdr;
|
||||
|
||||
if (__mpa_rr_revision(rep->params.bits) > MPA_REVISION_2) {
|
||||
@@ -895,7 +895,8 @@ static int siw_proc_mpareply(struct siw_cep *cep)
|
||||
}
|
||||
|
||||
out_err:
|
||||
siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -EINVAL);
|
||||
if (rv != -EAGAIN)
|
||||
siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -EINVAL);
|
||||
|
||||
return rv;
|
||||
}
|
||||
|
||||
@@ -1167,7 +1167,7 @@ int siw_create_cq(struct ib_cq *base_cq, const struct ib_cq_init_attr *attr,
|
||||
err_out:
|
||||
siw_dbg(base_cq->device, "CQ creation failed: %d", rv);
|
||||
|
||||
if (cq && cq->queue) {
|
||||
if (cq->queue) {
|
||||
struct siw_ucontext *ctx =
|
||||
rdma_udata_to_drv_context(udata, struct siw_ucontext,
|
||||
base_ucontext);
|
||||
|
||||
@@ -1109,7 +1109,7 @@ static bool ipoib_dev_addr_changed_valid(struct ipoib_dev_priv *priv)
|
||||
* if he sets the device address back to be based on GID index 0,
|
||||
* he no longer wishs to control it.
|
||||
*
|
||||
* If the user doesn't control the the device address,
|
||||
* If the user doesn't control the device address,
|
||||
* IPOIB_FLAG_DEV_ADDR_SET is set and ib_find_gid failed it means
|
||||
* the port GUID has changed and GID at index 0 has changed
|
||||
* so we need to change priv->local_gid and priv->dev->dev_addr
|
||||
|
||||
@@ -1664,8 +1664,10 @@ static void ipoib_napi_add(struct net_device *dev)
|
||||
{
|
||||
struct ipoib_dev_priv *priv = ipoib_priv(dev);
|
||||
|
||||
netif_napi_add(dev, &priv->recv_napi, ipoib_rx_poll, IPOIB_NUM_WC);
|
||||
netif_napi_add(dev, &priv->send_napi, ipoib_tx_poll, MAX_SEND_CQE);
|
||||
netif_napi_add_weight(dev, &priv->recv_napi, ipoib_rx_poll,
|
||||
IPOIB_NUM_WC);
|
||||
netif_napi_add_weight(dev, &priv->send_napi, ipoib_tx_poll,
|
||||
MAX_SEND_CQE);
|
||||
}
|
||||
|
||||
static void ipoib_napi_del(struct net_device *dev)
|
||||
|
||||
@@ -246,6 +246,7 @@ static int iser_create_ib_conn_res(struct ib_conn *ib_conn)
|
||||
device = ib_conn->device;
|
||||
ib_dev = device->ib_device;
|
||||
|
||||
/* +1 for drain */
|
||||
if (ib_conn->pi_support)
|
||||
max_send_wr = ISER_QP_SIG_MAX_REQ_DTOS + 1;
|
||||
else
|
||||
@@ -267,7 +268,8 @@ static int iser_create_ib_conn_res(struct ib_conn *ib_conn)
|
||||
init_attr.qp_context = (void *)ib_conn;
|
||||
init_attr.send_cq = ib_conn->cq;
|
||||
init_attr.recv_cq = ib_conn->cq;
|
||||
init_attr.cap.max_recv_wr = ISER_QP_MAX_RECV_DTOS;
|
||||
/* +1 for drain */
|
||||
init_attr.cap.max_recv_wr = ISER_QP_MAX_RECV_DTOS + 1;
|
||||
init_attr.cap.max_send_sge = 2;
|
||||
init_attr.cap.max_recv_sge = 1;
|
||||
init_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
|
||||
@@ -485,7 +487,7 @@ int iser_conn_terminate(struct iser_conn *iser_conn)
|
||||
iser_conn, err);
|
||||
|
||||
/* block until all flush errors are consumed */
|
||||
ib_drain_sq(ib_conn->qp);
|
||||
ib_drain_qp(ib_conn->qp);
|
||||
}
|
||||
|
||||
return 1;
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user