[DRAFT] nvme_driver: fix overflow computing queue size and don't wait for bad devices forever #682

Draft · wants to merge 12 commits into main
2 changes: 1 addition & 1 deletion openhcl/underhill_core/src/nvme_manager.rs
@@ -303,7 +303,7 @@ impl NvmeManagerWorker
.map_err(InnerError::Vfio)?;

let driver =
nvme_driver::NvmeDriver::new(&self.driver_source, self.vp_count, device)
nvme_driver::NvmeDriver::new(&self.driver_source, self.vp_count, device, None)
.instrument(tracing::info_span!(
"nvme_driver_init",
pci_id = entry.key()
@@ -13,11 +13,13 @@ use nvme::NvmeController;
use nvme::NvmeControllerCaps;
use nvme_driver::Namespace;
use nvme_driver::NvmeDriver;
use nvme_driver::NVME_TIMEOUT_FUZZER;
use nvme_spec::nvm::DsmRange;
use pal_async::DefaultDriver;
use pci_core::msi::MsiInterruptSet;
use scsi_buffers::OwnedRequestBuffers;
use std::convert::TryFrom;
use std::time::Duration;
use user_driver::emulated::DeviceSharedMemory;
use vmcore::vm_task::SingleDriverBackend;
use vmcore::vm_task::VmTaskDriverSource;
@@ -67,7 +69,8 @@ impl FuzzNvmeDriver {
.unwrap();

let device = FuzzEmulatedDevice::new(nvme, msi_set, mem);
let nvme_driver = NvmeDriver::new(&driver_source, cpu_count, device).await?; // TODO: [use-arbitrary-input]
let timeout = Duration::from_millis(NVME_TIMEOUT_FUZZER);
let nvme_driver = NvmeDriver::new(&driver_source, cpu_count, device, Some(timeout)).await?; // TODO: [use-arbitrary-input]
let namespace = nvme_driver.namespace(1).await?; // TODO: [use-arbitrary-input]

Ok(Self {
34 changes: 26 additions & 8 deletions vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs
@@ -15,6 +15,8 @@ use crate::NamespaceError
use crate::NvmeDriverSavedState;
use crate::RequestError;
use crate::NVME_PAGE_SHIFT;
use crate::NVME_TIMEOUT_INCREMENT;
use crate::NVME_TIMEOUT_MINIMUM;
use anyhow::Context as _;
use futures::future::join_all;
use futures::StreamExt;
@@ -27,6 +29,7 @@ use pal_async::task::Task;
use save_restore::NvmeDriverWorkerSavedState;
use std::sync::Arc;
use std::sync::OnceLock;
use std::time::Duration;
use task_control::AsyncRun;
use task_control::InspectTask;
use task_control::TaskControl;
@@ -71,6 +74,8 @@ pub struct NvmeDriver<T: DeviceBacking> {
namespaces: Vec<Arc<Namespace>>,
/// Keeps the controller connected (CC.EN==1) while servicing.
nvme_keepalive: bool,
#[inspect(skip)]
reset_timeout: Duration,
}

#[derive(Inspect)]
@@ -165,14 +170,16 @@ enum NvmeWorkerRequest {
}

impl<T: DeviceBacking> NvmeDriver<T> {
/// Initializes the driver.
/// Initializes the driver. Provide a reset timeout only for testing;
/// if none is given, the timeout is derived from the device capabilities (CAP.TO).
pub async fn new(
driver_source: &VmTaskDriverSource,
cpu_count: u32,
device: T,
reset_timeout: Option<Duration>,
) -> anyhow::Result<Self> {
let pci_id = device.id().to_owned();
let mut this = Self::new_disabled(driver_source, cpu_count, device)
let mut this = Self::new_disabled(driver_source, cpu_count, device, reset_timeout)
.instrument(tracing::info_span!("nvme_new_disabled", pci_id))
.await?;
match this
@@ -198,6 +205,7 @@ impl<T: DeviceBacking> NvmeDriver<T> {
driver_source: &VmTaskDriverSource,
cpu_count: u32,
mut device: T,
reset_timeout: Option<Duration>,
) -> anyhow::Result<Self> {
let driver = driver_source.simple();
let bar0 = Bar0(
@@ -206,10 +214,15 @@ impl<T: DeviceBacking> NvmeDriver<T> {
.context("failed to map device registers")?,
);

let timeout = match reset_timeout {
Some(duration) => duration,
None => Duration::from_millis(NVME_TIMEOUT_MINIMUM.max(bar0.cap().to() as u64 * NVME_TIMEOUT_INCREMENT)),
};

let cc = bar0.cc();
if cc.en() || bar0.csts().rdy() {
if !bar0
.reset(&driver)
.reset(&driver, timeout)
.instrument(tracing::info_span!(
"nvme_already_enabled",
pci_id = device.id().to_owned()
@@ -254,6 +267,7 @@ impl<T: DeviceBacking> NvmeDriver<T> {
rescan_event: Default::default(),
namespaces: vec![],
nvme_keepalive: false,
reset_timeout: timeout,
})
}
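For context, a minimal sketch of how the default timeout falls out of the new constants. It assumes CAP.TO reports the controller's worst-case ready-transition time in 500 ms units (per the NVMe spec); `reset_timeout_ms` is a hypothetical helper for illustration, not part of this change:

```rust
const NVME_TIMEOUT_INCREMENT: u64 = 500; // ms per CAP.TO unit
const NVME_TIMEOUT_MINIMUM: u64 = 10_000; // ms; never wait less than 10 s

/// Worst-case reset timeout in milliseconds, derived from CAP.TO.
fn reset_timeout_ms(cap_to: u8) -> u64 {
    NVME_TIMEOUT_MINIMUM.max(u64::from(cap_to) * NVME_TIMEOUT_INCREMENT)
}

fn main() {
    assert_eq!(reset_timeout_ms(0), 10_000); // floor covers devices that report 0
    assert_eq!(reset_timeout_ms(60), 30_000); // 60 units * 500 ms = 30 s
    assert_eq!(reset_timeout_ms(0xFF), 127_500); // spec maximum, about two minutes
}
```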

@@ -319,7 +333,7 @@ impl<T: DeviceBacking> NvmeDriver<T> {
anyhow::bail!("device is gone");
}
if csts.cfs() {
worker.registers.bar0.reset(&self.driver).await;
worker.registers.bar0.reset(&self.driver, self.reset_timeout).await;
anyhow::bail!("device had fatal error");
}
if csts.rdy() {
@@ -399,8 +413,8 @@ impl<T: DeviceBacking> NvmeDriver<T> {
let max_io_queues = allocated_io_queue_count.min(requested_io_queue_count);

let qsize = {
let io_cqsize = QueuePair::MAX_CQ_ENTRIES.min(worker.registers.cap.mqes_z() + 1);
let io_sqsize = QueuePair::MAX_SQ_ENTRIES.min(worker.registers.cap.mqes_z() + 1);
let io_cqsize = QueuePair::MAX_CQ_ENTRIES.min(worker.registers.cap.mqes_z().saturating_add(1));
let io_sqsize = QueuePair::MAX_SQ_ENTRIES.min(worker.registers.cap.mqes_z().saturating_add(1));

// Some hardware (such as ASAP) requires that the sq and cq have the same size.
io_cqsize.min(io_sqsize)
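The overflow this hunk fixes is concrete: CAP.MQES is a zero-based 16-bit field, so a device reporting 0xFFFF makes the old `+ 1` wrap (a panic in debug builds, a zero-sized queue in release). A minimal sketch of the fix, assuming `mqes_z()` yields the raw `u16`:

```rust
fn io_queue_size(mqes_z: u16, max_entries: u16) -> u16 {
    // Old code was effectively max_entries.min(mqes_z + 1), which
    // overflows u16 when a (buggy or hostile) device reports 0xFFFF.
    // Saturating keeps the result clamped at u16::MAX instead.
    max_entries.min(mqes_z.saturating_add(1))
}

fn main() {
    assert_eq!(io_queue_size(u16::MAX, 1024), 1024); // no wrap to a zero-sized queue
    assert_eq!(io_queue_size(63, 1024), 64); // typical device: MQES = 63 -> 64 entries
}
```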
@@ -454,6 +468,7 @@ impl<T: DeviceBacking> NvmeDriver<T> {

fn reset(&mut self) -> impl 'static + Send + std::future::Future<Output = ()> {
let driver = self.driver.clone();
let timeout = self.reset_timeout;
let mut task = std::mem::take(&mut self.task).unwrap();
async move {
task.stop().await;
@@ -468,7 +483,7 @@ impl<T: DeviceBacking> NvmeDriver<T> {
if let Some(admin) = worker.admin {
_admin_responses = admin.shutdown().await;
}
worker.registers.bar0.reset(&driver).await;
worker.registers.bar0.reset(&driver, timeout).await;
}
}

@@ -549,6 +564,8 @@ impl<T: DeviceBacking> NvmeDriver<T> {
if !bar0.csts().rdy() {
anyhow::bail!("device is gone");
}

let reset_timeout = Duration::from_millis(NVME_TIMEOUT_MINIMUM.max(bar0.cap().to() as u64 * NVME_TIMEOUT_INCREMENT));

let registers = Arc::new(DeviceRegisters::new(bar0));

@@ -557,7 +574,7 @@ impl<T: DeviceBacking> NvmeDriver<T> {
per_cpu: (0..cpu_count).map(|_| OnceLock::new()).collect(),
send,
});

let mut this = Self {
device_id: device.id().to_owned(),
task: Some(TaskControl::new(DriverWorkerTask {
@@ -579,6 +596,7 @@ impl<T: DeviceBacking> NvmeDriver<T> {
rescan_event: Default::default(),
namespaces: vec![],
nvme_keepalive: true,
reset_timeout,
};

let task = &mut this.task.as_mut().unwrap();
3 changes: 3 additions & 0 deletions vm/devices/storage/disk_nvme/nvme_driver/src/lib.rs
@@ -25,3 +25,6 @@ pub use self::queue_pair::RequestError;
use nvme_spec as spec;

const NVME_PAGE_SHIFT: u8 = 12;
const NVME_TIMEOUT_INCREMENT: u64 = 500; // ms per CAP.TO unit
const NVME_TIMEOUT_MINIMUM: u64 = 10000; // ms; lower bound on the reset timeout
pub const NVME_TIMEOUT_FUZZER: u64 = 10; // ms; deliberately short so fuzzing never stalls on a bad device
8 changes: 7 additions & 1 deletion vm/devices/storage/disk_nvme/nvme_driver/src/registers.rs
@@ -8,6 +8,8 @@ use inspect::Inspect;
use pal_async::driver::Driver;
use std::sync::atomic::AtomicBool;
use std::sync::atomic::Ordering::Relaxed;
use std::time::Duration;
use std::time::Instant;
use tracing::instrument;
use user_driver::backoff::Backoff;
use user_driver::DeviceBacking;
@@ -104,10 +106,11 @@ impl<T: DeviceRegisterIo + Inspect> Bar0<T> {
reg32!(aqa, set_aqa, AQA, spec::Aqa);

#[instrument(skip_all)]
pub async fn reset(&self, driver: &dyn Driver) -> bool {
pub async fn reset(&self, driver: &dyn Driver, reset_timeout: Duration) -> bool {
let cc = self.cc().with_en(false);
self.set_cc(cc);
let mut backoff = Backoff::new(driver);
let start = Instant::now();
loop {
let csts = self.csts();
if !csts.rdy() {
@@ -116,6 +119,9 @@ impl<T: DeviceRegisterIo + Inspect> Bar0<T> {
if u32::from(csts) == !0 {
break false;
}
if start.elapsed() >= reset_timeout {
break false;
[Review comment · Member] The caller seems to rely on this actually resetting the device before it frees buffers. With this change, you might now return with the device still referencing these buffers, and that can lead to memory corruption.

[Review comment · Member] Suggested comment for this exit path:
            // Hold onto responses until the reset completes so that waiting IOs do
            // not think the memory is unaliased by the device.

}
backoff.back_off().await;
}
}
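Regarding the inline review comments above: one option is to surface the timeout distinctly rather than folding it into `false`, so the caller knows the device may still reference host memory and keeps those buffers alive instead of freeing them. A hypothetical sketch (the `ResetOutcome` type is illustrative, not part of this PR):

```rust
/// Possible outcomes of a bounded controller reset (hypothetical).
#[derive(Debug, PartialEq)]
enum ResetOutcome {
    /// CSTS.RDY cleared: the device quiesced and no longer references
    /// host buffers, so they are safe to free.
    Ready,
    /// Register reads returned !0: the device is gone from the bus.
    Gone,
    /// The timeout elapsed with CSTS.RDY still set: the device may still
    /// be issuing DMA, so the caller must keep the buffers alive (for
    /// example by leaking them) rather than freeing them.
    TimedOut,
}
```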
4 changes: 2 additions & 2 deletions vm/devices/storage/disk_nvme/nvme_driver/src/tests.rs
@@ -74,7 +74,7 @@ async fn test_nvme_driver(driver: DefaultDriver, allow_dma: bool) {

let device = EmulatedDevice::new(nvme, msi_set, mem);

let driver = NvmeDriver::new(&driver_source, CPU_COUNT, device)
let driver = NvmeDriver::new(&driver_source, CPU_COUNT, device, None)
.await
.unwrap();

@@ -183,7 +183,7 @@ async fn test_nvme_save_restore_inner(driver: DefaultDriver) {
.unwrap();

let device = EmulatedDevice::new(nvme_ctrl, msi_x, mem);
let mut nvme_driver = NvmeDriver::new(&driver_source, CPU_COUNT, device)
let mut nvme_driver = NvmeDriver::new(&driver_source, CPU_COUNT, device, None)
.await
.unwrap();
let _ns1 = nvme_driver.namespace(1).await.unwrap();