Re: [RFC PATCH 03/11] rust: block: introduce `kernel::block::mq` module

From: Andreas Hindborg (Samsung)
Date: Tue Jan 23 2024 - 09:00:47 EST



Hi Benno,

Benno Lossin <benno.lossin@xxxxxxxxx> writes:

<...>

>> diff --git a/rust/kernel/block/mq/gen_disk.rs b/rust/kernel/block/mq/gen_disk.rs
>> new file mode 100644
>> index 000000000000..50496af15bbf
>> --- /dev/null
>> +++ b/rust/kernel/block/mq/gen_disk.rs
>> @@ -0,0 +1,133 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +
>> +//! GenDisk abstraction
>> +//!
>> +//! C header: [`include/linux/blkdev.h`](../../include/linux/blkdev.h)
>> +//! C header: [`include/linux/blk_mq.h`](../../include/linux/blk_mq.h)
>> +
>> +use crate::block::mq::{raw_writer::RawWriter, Operations, TagSet};
>> +use crate::{
>> + bindings, error::from_err_ptr, error::Result, sync::Arc, types::ForeignOwnable,
>> + types::ScopeGuard,
>> +};
>> +use core::fmt::{self, Write};
>> +
>> +/// A generic block device
>> +///
>> +/// # Invariants
>> +///
>> +/// - `gendisk` must always point to an initialized and valid `struct gendisk`.
>> +pub struct GenDisk<T: Operations> {
>> + _tagset: Arc<TagSet<T>>,
>> + gendisk: *mut bindings::gendisk,
>
> Why are these two fields not embedded? Shouldn't the user decide where
> to allocate?

The `TagSet` can be shared between multiple `GenDisk`. Using an `Arc`
seems resonable?

For the `gendisk` field, the allocation is done by C and the address
must be stable. We are owning the pointee and must drop it when it goes out
of scope. I could do this:

#[repr(transparent)]
struct GenDisk(Opaque<bindings::gendisk>);

struct UniqueGenDiskRef {
_tagset: Arc<TagSet<T>>,
gendisk: Pin<&'static mut GenDisk>,

}

but it seems pointless. `struct GenDisk` would not be pub in that case. What do you think?

>
>> +}
>> +
>> +// SAFETY: `GenDisk` is an owned pointer to a `struct gendisk` and an `Arc` to a
>> +// `TagSet` It is safe to send this to other threads as long as T is Send.
>> +unsafe impl<T: Operations + Send> Send for GenDisk<T> {}
>> +
>> +impl<T: Operations> GenDisk<T> {
>> + /// Try to create a new `GenDisk`
>> + pub fn try_new(tagset: Arc<TagSet<T>>, queue_data: T::QueueData) -> Result<Self> {
>> + let data = queue_data.into_foreign();
>> + let recover_data = ScopeGuard::new(|| {
>> + // SAFETY: T::QueueData was created by the call to `into_foreign()` above
>> + unsafe { T::QueueData::from_foreign(data) };
>> + });
>> +
>> + let lock_class_key = crate::sync::LockClassKey::new();
>> +
>> + // SAFETY: `tagset.raw_tag_set()` points to a valid and initialized tag set
>> + let gendisk = from_err_ptr(unsafe {
>> + bindings::__blk_mq_alloc_disk(tagset.raw_tag_set(), data as _, lock_class_key.as_ptr())
>
> Avoid `as _` casts.

👍

>
>> + })?;
>> +
>> + const TABLE: bindings::block_device_operations = bindings::block_device_operations {
>> + submit_bio: None,
>> + open: None,
>> + release: None,
>> + ioctl: None,
>> + compat_ioctl: None,
>> + check_events: None,
>> + unlock_native_capacity: None,
>> + getgeo: None,
>> + set_read_only: None,
>> + swap_slot_free_notify: None,
>> + report_zones: None,
>> + devnode: None,
>> + alternative_gpt_sector: None,
>> + get_unique_id: None,
>> + owner: core::ptr::null_mut(),
>> + pr_ops: core::ptr::null_mut(),
>> + free_disk: None,
>> + poll_bio: None,
>> + };
>> +
>> + // SAFETY: gendisk is a valid pointer as we initialized it above
>> + unsafe { (*gendisk).fops = &TABLE };
>> +
>> + recover_data.dismiss();
>> + Ok(Self {
>> + _tagset: tagset,
>> + gendisk,
>> + })
>> + }
>> +
>> + /// Set the name of the device
>> + pub fn set_name(&self, args: fmt::Arguments<'_>) -> Result {
>> + let mut raw_writer = RawWriter::from_array(unsafe { &mut (*self.gendisk).disk_name });
>
> Missing `SAFETY` also see below.

Yes, I have a few of those. Will add for next version.

>
>> + raw_writer.write_fmt(args)?;
>> + raw_writer.write_char('\0')?;
>> + Ok(())
>> + }
>> +
>> + /// Register the device with the kernel. When this function return, the
>> + /// device is accessible from VFS. The kernel may issue reads to the device
>> + /// during registration to discover partition infomation.
>> + pub fn add(&self) -> Result {
>> + crate::error::to_result(unsafe {
>> + bindings::device_add_disk(core::ptr::null_mut(), self.gendisk, core::ptr::null_mut())
>> + })
>> + }
>> +
>> + /// Call to tell the block layer the capcacity of the device
>> + pub fn set_capacity(&self, sectors: u64) {
>> + unsafe { bindings::set_capacity(self.gendisk, sectors) };
>> + }
>> +
>> + /// Set the logical block size of the device
>> + pub fn set_queue_logical_block_size(&self, size: u32) {
>> + unsafe { bindings::blk_queue_logical_block_size((*self.gendisk)queue, size) };
>> + }
>> +
>> + /// Set the physical block size of the device
>
> What does this *do*? I do not think the doc string gives any meaningful
> information not present in the function name (this might just be,
> because I have no idea of what this is and anyone with just a little
> more knowledge would know, but I still wanted to mention it).

I'll add some more context.

>
>> + pub fn set_queue_physical_block_size(&self, size: u32) {
>> + unsafe { bindings::blk_queue_physical_block_size((*self.gendisk).queue, size) };
>> + }
>> +
>> + /// Set the rotational media attribute for the device
>> + pub fn set_rotational(&self, rotational: bool) {
>> + if !rotational {
>> + unsafe {
>> + bindings::blk_queue_flag_set(bindings::QUEUE_FLAG_NONROT, (*self.gendisk).queue)
>> + };
>> + } else {
>> + unsafe {
>> + bindings::blk_queue_flag_clear(bindings::QUEUE_FLAG_NONROT, (*self.gendisk).queue)
>> + };
>> + }
>> + }
>> +}
>> +
>> +impl<T: Operations> Drop for GenDisk<T> {
>> + fn drop(&mut self) {
>> + let queue_data = unsafe { (*(*self.gendisk).queue).queuedata };
>> +
>> + unsafe { bindings::del_gendisk(self.gendisk) };
>> +
>> + // SAFETY: `queue.queuedata` was created by `GenDisk::try_new()` with a
>> + // call to `ForeignOwnable::into_pointer()` to create `queuedata`.
>> + // `ForeignOwnable::from_foreign()` is only called here.
>> + let _queue_data = unsafe { T::QueueData::from_foreign(queue_data) };
>> + }
>> +}
>> diff --git a/rust/kernel/block/mq/operations.rs b/rust/kernel/block/mq/operations.rs
>> new file mode 100644
>> index 000000000000..fb1ab707d1f0
>> --- /dev/null
>> +++ b/rust/kernel/block/mq/operations.rs
>> @@ -0,0 +1,260 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +
>> +//! This module provides an interface for blk-mq drivers to implement.
>> +//!
>> +//! C header: [`include/linux/blk-mq.h`](../../include/linux/blk-mq.h)
>> +
>> +use crate::{
>> + bindings,
>> + block::mq::{tag_set::TagSetRef, Request},
>> + error::{from_result, Result},
>> + types::ForeignOwnable,
>> +};
>> +use core::{marker::PhantomData, pin::Pin};
>> +
>> +/// Implement this trait to interface blk-mq as block devices
>> +#[macros::vtable]
>> +pub trait Operations: Sized {
>
> Is this trait really safe? Are there **no** requirements for e.g.
> `QueueData`? So could I use `Box<()>`?

Yes, it is intended to be safe. `ForeignOwnable` covers safety
requirements for these associated data types.

>
>> + /// Data associated with a request. This data is located next to the request
>> + /// structure.
>> + type RequestData;
>> +
>> + /// Data associated with the `struct request_queue` that is allocated for
>> + /// the `GenDisk` associated with this `Operations` implementation.
>> + type QueueData: ForeignOwnable;
>> +
>> + /// Data associated with a dispatch queue. This is stored as a pointer in
>> + /// `struct blk_mq_hw_ctx`.
>> + type HwData: ForeignOwnable;
>> +
>> + /// Data associated with a tag set. This is stored as a pointer in `struct
>> + /// blk_mq_tag_set`.
>> + type TagSetData: ForeignOwnable;
>> +
>> + /// Called by the kernel to allocate a new `RequestData`. The structure will
>> + /// eventually be pinned, so defer initialization to `init_request_data()`
>> + fn new_request_data(
>> + _tagset_data: <Self::TagSetData as ForeignOwnable>::Borrowed<'_>,
>> + ) -> Result<Self::RequestData>;
>> +
>> + /// Called by the kernel to initialize a previously allocated `RequestData`
>> + fn init_request_data(
>> + _tagset_data: <Self::TagSetData as ForeignOwnable>::Borrowed<'_>,
>> + _data: Pin<&mut Self::RequestData>,
>> + ) -> Result {
>> + Ok(())
>> + }
>> +
>> + /// Called by the kernel to queue a request with the driver. If `is_last` is
>> + /// `false`, the driver is allowed to defer commiting the request.
>> + fn queue_rq(
>> + hw_data: <Self::HwData as ForeignOwnable>::Borrowed<'_>,
>> + queue_data: <Self::QueueData as ForeignOwnable>::Borrowed<'_>,
>> + rq: &Request<Self>,
>> + is_last: bool,
>> + ) -> Result;
>> +
>> + /// Called by the kernel to indicate that queued requests should be submitted
>> + fn commit_rqs(
>> + hw_data: <Self::HwData as ForeignOwnable>::Borrowed<'_>,
>> + queue_data: <Self::QueueData as ForeignOwnable>::Borrowed<'_>,
>> + );
>> +
>> + /// Called by the kernel when the request is completed
>> + fn complete(_rq: &Request<Self>);
>> +
>> + /// Called by the kernel to allocate and initialize a driver specific hardware context data
>> + fn init_hctx(
>> + tagset_data: <Self::TagSetData as ForeignOwnable>::Borrowed<'_>,
>> + hctx_idx: u32,
>> + ) -> Result<Self::HwData>;
>> +
>> + /// Called by the kernel to poll the device for completed requests. Only used for poll queues.
>> + fn poll(_hw_data: <Self::HwData as ForeignOwnable>::Borrowed<'_>) -> i32 {
>> + unreachable!()
>
> Why are these implemented this way? Should this really panic? Maybe
> return an error? Why `i32` as the return type? If it can error it should
> be `Result<u32>`.

I will update in accordance with the new documentation for `#[vtable]`.

Return type should be `bool`, I will change it. It inherited the int
from `core::ffi::c_int`.

>
>> + }
>> +
>> + /// Called by the kernel to map submission queues to CPU cores.
>> + fn map_queues(_tag_set: &TagSetRef) {
>> + unreachable!()
>> + }
>> +
>> + // There is no need for exit_request() because `drop` will be called.
>> +}
>> +
>> +pub(crate) struct OperationsVtable<T: Operations>(PhantomData<T>);
>> +
>> +impl<T: Operations> OperationsVtable<T> {
>> + // # Safety
>> + //
>> + // The caller of this function must ensure that `hctx` and `bd` are valid
>> + // and initialized. The pointees must outlive this function. Further
>> + // `hctx->driver_data` must be a pointer created by a call to
>> + // `Self::init_hctx_callback()` and the pointee must outlive this function.
>> + // This function must not be called with a `hctx` for which
>> + // `Self::exit_hctx_callback()` has been called.
>> + unsafe extern "C" fn queue_rq_callback(
>> + hctx: *mut bindings::blk_mq_hw_ctx,
>> + bd: *const bindings::blk_mq_queue_data,
>> + ) -> bindings::blk_status_t {
>> + // SAFETY: `bd` is valid as required by the safety requirement for this function.
>> + let rq = unsafe { (*bd).rq };
>> +
>> + // SAFETY: The safety requirement for this function ensure that
>> + // `(*hctx).driver_data` was returned by a call to
>> + // `Self::init_hctx_callback()`. That function uses
>> + // `PointerWrapper::into_pointer()` to create `driver_data`. Further,
>> + // the returned value does not outlive this function and
>> + // `from_foreign()` is not called until `Self::exit_hctx_callback()` is
>> + // called. By the safety requirement of this function and contract with
>> + // the `blk-mq` API, `queue_rq_callback()` will not be called after that
>> + // point.
>
> This safety section and the others here are rather long and mostly
> repeat themselves. Is it possible to put this in its own module and
> explain the safety invariants in that module and then in these safety
> sections just refer to some labels from that section?
>
> I think we should discuss this in our next meeting.

Not sure about the best way to do this. Lets talk.

<...>

>> +
>> + pub(crate) const unsafe fn build() -> &'static bindings::blk_mq_ops {
>> + &Self::VTABLE
>> + }
>
> Why is this function `unsafe`?

I don't think it needs to be unsafe, thanks.

>
>> +}
>
> Some `# Safety` and `SAFETY` missing in this hunk.
>
>> diff --git a/rust/kernel/block/mq/raw_writer.rs b/rust/kernel/block/mq/raw_writer.rs
>> new file mode 100644
>> index 000000000000..25c16ee0b1f7
>> --- /dev/null
>> +++ b/rust/kernel/block/mq/raw_writer.rs
>> @@ -0,0 +1,30 @@
>> +use core::fmt::{self, Write};
>> +
>> +pub(crate) struct RawWriter {
>> + ptr: *mut u8,
>> + len: usize,
>> +}
>> +
>> +impl RawWriter {
>> + unsafe fn new(ptr: *mut u8, len: usize) -> Self {
>> + Self { ptr, len }
>> + }
>> +
>> + pub(crate) fn from_array<const N: usize>(a: &mut [core::ffi::c_char; N]) -> Self {
>> + unsafe { Self::new(&mut a[0] as *mut _ as _, N) }
>> + }
>
> This function needs to be `unsafe`, because it never captures the
> lifetime of `a`. I can write:
> let mut a = Box::new([0; 10]);
> let mut writer = RawWriter::from_array(&mut *a);
> drop(a);
> writer.write_str("Abc"); // UAF
> Alternatively add a lifetime to `RawWriter`.

Yes, a lifetime is missing in RawWriter, thanks.

>
>> +}
>> +
>> +impl Write for RawWriter {
>> + fn write_str(&mut self, s: &str) -> fmt::Result {
>> + let bytes = s.as_bytes();
>> + let len = bytes.len();
>> + if len > self.len {
>> + return Err(fmt::Error);
>> + }
>> + unsafe { core::ptr::copy_nonoverlapping(&bytes[0], self.ptr, len) };
>> + self.ptr = unsafe { self.ptr.add(len) };
>> + self.len -= len;
>> + Ok(())
>> + }
>> +}
>> diff --git a/rust/kernel/block/mq/request.rs b/rust/kernel/block/mq/request.rs
>> new file mode 100644
>> index 000000000000..e95ae3fd71ad
>> --- /dev/null
>> +++ b/rust/kernel/block/mq/request.rs
>> @@ -0,0 +1,71 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +
>> +//! This module provides a wrapper for the C `struct request` type.
>> +//!
>> +//! C header: [`include/linux/blk-mq.h`](../../include/linux/blk-mq.h)
>> +
>> +use crate::{
>> + bindings,
>> + block::mq::Operations,
>> + error::{Error, Result},
>> +};
>> +use core::marker::PhantomData;
>> +
>> +/// A wrapper around a blk-mq `struct request`. This represents an IO request.
>> +pub struct Request<T: Operations> {
>> + ptr: *mut bindings::request,
>
> Why is this not embedded?

I have changed it to `struct Request(Opaque<bindings::request>)` for
next version 👍

>
>> + _p: PhantomData<T>,
>> +}
>> +
>> +impl<T: Operations> Request<T> {
>> + pub(crate) unsafe fn from_ptr(ptr: *mut bindings::request) -> Self {
>> + Self {
>> + ptr,
>> + _p: PhantomData,
>> + }
>> + }
>> +
>> + /// Get the command identifier for the request
>> + pub fn command(&self) -> u32 {
>> + unsafe { (*self.ptr).cmd_flags & ((1 << bindings::REQ_OP_BITS) - 1) }
>> + }
>> +
>> + /// Call this to indicate to the kernel that the request has been issued by the driver
>> + pub fn start(&self) {
>> + unsafe { bindings::blk_mq_start_request(self.ptr) };
>> + }
>> +
>> + /// Call this to indicate to the kernel that the request has been completed without errors
>> + // TODO: Consume rq so that we can't use it after ending it?
>> + pub fn end_ok(&self) {
>> + unsafe { bindings::blk_mq_end_request(self.ptr, bindings::BLK_STS_OK as _) };
>> + }
>> +
>> + /// Call this to indicate to the kernel that the request completed with an error
>> + pub fn end_err(&self, err: Error) {
>> + unsafe { bindings::blk_mq_end_request(self.ptr, err.to_blk_status()) };
>> + }
>> +
>> + /// Call this to indicate that the request completed with the status indicated by `status`
>> + pub fn end(&self, status: Result) {
>> + if let Err(e) = status {
>> + self.end_err(e);
>> + } else {
>> + self.end_ok();
>> + }
>> + }
>> +
>> + /// Call this to schedule defered completion of the request
>> + // TODO: Consume rq so that we can't use it after completing it?
>> + pub fn complete(&self) {
>> + if !unsafe { bindings::blk_mq_complete_request_remote(self.ptr) } {
>> + T::complete(&unsafe { Self::from_ptr(self.ptr) });
>> + }
>> + }
>> +
>> + /// Get the target sector for the request
>> + #[inline(always)]
>
> Why is this `inline(always)`?

Compiler would not inline from kernel crate to modules without this. I
will check if this is still the case.

>
>> + pub fn sector(&self) -> usize {
>> + unsafe { (*self.ptr).__sector as usize }
>> + }
>> +}
>> diff --git a/rust/kernel/block/mq/tag_set.rs b/rust/kernel/block/mq/tag_set.rs
>> new file mode 100644
>> index 000000000000..d122db7f6d0e
>> --- /dev/null
>> +++ b/rust/kernel/block/mq/tag_set.rs
>> @@ -0,0 +1,92 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +
>> +//! This module provides the `TagSet` struct to wrap the C `struct blk_mq_tag_set`.
>> +//!
>> +//! C header: [`include/linux/blk-mq.h`](../../include/linux/blk-mq.h)
>> +
>> +use crate::{
>> + bindings,
>> + block::mq::{operations::OperationsVtable, Operations},
>> + error::{Error, Result},
>> + sync::Arc,
>> + types::ForeignOwnable,
>> +};
>> +use core::{cell::UnsafeCell, convert::TryInto, marker::PhantomData};
>> +
>> +/// A wrapper for the C `struct blk_mq_tag_set`
>> +pub struct TagSet<T: Operations> {
>> + inner: UnsafeCell<bindings::blk_mq_tag_set>,
>> + _p: PhantomData<T>,
>> +}
>> +
>> +impl<T: Operations> TagSet<T> {
>> + /// Try to create a new tag set
>> + pub fn try_new(
>> + nr_hw_queues: u32,
>> + tagset_data: T::TagSetData,
>> + num_tags: u32,
>> + num_maps: u32,
>> + ) -> Result<Arc<Self>> {
>
> Why force the users to use `Arc`?

Changed to return a `PinInit<TagSet>` for next version.

>
>> + let tagset = Arc::try_new(Self {
>> + inner: UnsafeCell::new(bindings::blk_mq_tag_set::default()),
>> + _p: PhantomData,
>> + })?;
>> +
>> + // SAFETY: We just allocated `tagset`, we know this is the only reference to it.
>> + let inner = unsafe { &mut *tagset.inner.get() };
>> +
>> + inner.ops = unsafe { OperationsVtable::<T>::build() };
>> + inner.nr_hw_queues = nr_hw_queues;
>> + inner.timeout = 0; // 0 means default which is 30 * HZ in C
>> + inner.numa_node = bindings::NUMA_NO_NODE;
>> + inner.queue_depth = num_tags;
>> + inner.cmd_size = core::mem::size_of::<T::RequestData>().try_into()?;
>> + inner.flags = bindings::BLK_MQ_F_SHOULD_MERGE;
>> + inner.driver_data = tagset_data.into_foreign() as _;
>> + inner.nr_maps = num_maps;
>> +
>> + // SAFETY: `inner` points to valid and initialised memory.
>> + let ret = unsafe { bindings::blk_mq_alloc_tag_set(inner) };
>> + if ret < 0 {
>> + // SAFETY: We created `driver_data` above with `into_foreign`
>> + unsafe { T::TagSetData::from_foreign(inner.driver_data) };
>> + return Err(Error::from_errno(ret));
>> + }
>> +
>> + Ok(tagset)
>> + }
>> +
>> + /// Return the pointer to the wrapped `struct blk_mq_tag_set`
>> + pub(crate) fn raw_tag_set(&self) -> *mut bindings::blk_mq_tag_set {
>> + self.inner.get()
>> + }
>> +}
>> +
>> +impl<T: Operations> Drop for TagSet<T> {
>> + fn drop(&mut self) {
>> + let tagset_data = unsafe { (*self.inner.get()).driver_data };
>> +
>> + // SAFETY: `inner` is valid and has been properly initialised during construction.
>> + unsafe { bindings::blk_mq_free_tag_set(self.inner.get()) };
>> +
>> + // SAFETY: `tagset_data` was created by a call to
>> + // `ForeignOwnable::into_foreign` in `TagSet::try_new()`
>> + unsafe { T::TagSetData::from_foreign(tagset_data) };
>> + }
>> +}
>> +
>> +/// A tag set reference. Used to control lifetime and prevent drop of TagSet references passed to
>> +/// `Operations::map_queues()`
>> +pub struct TagSetRef {
>> + ptr: *mut bindings::blk_mq_tag_set,
>> +}
>> +
>> +impl TagSetRef {
>> + pub(crate) unsafe fn from_ptr(tagset: *mut bindings::blk_mq_tag_set) -> Self {
>> + Self { ptr: tagset }
>> + }
>> +
>> + pub fn ptr(&self) -> *mut bindings::blk_mq_tag_set {
>> + self.ptr
>> + }
>> +}
>
> This is a **very** thin abstraction, why is it needed?

It is not. I changed it to `&TagSet`, thanks.


Thanks for the comments!

Best regards,
Andreas