Skip to content

Commit 472d9f7

Browse files
authored
Reject add sled requests for sleds that already exist (#5675)
I ran into this as a part of #5625, where adding a previously-expunged sled appeared to succeed, but didn't actually add anything new.

Today, if we try to add a sled that is already _running_, we get a 500, because Nexus fails when it tries to tell the sled-agent to start. But with this PR, we fail earlier: adding a sled that already has a subnet allocation fails before we even try to talk to the sled-agent, because an existing allocation means someone has already added this sled:

```
root@oxz_switch:~# omdb -w nexus sleds add g2 i86pc
added sled g2 (i86pc): 90413e40-8139-43b4-9081-365dab6e5579
root@oxz_switch:~# omdb -w nexus sleds add g2 i86pc
Error: adding sled

Caused by:
    Error Response: status: 400 Bad Request; headers: {"content-type": "application/json", "x-request-id": "9eb95a9f-3fe0-4f75-8846-13490b95500e", "content-length": "188", "date": "Tue, 30 Apr 2024 20:54:49 GMT"}; value: Error { error_code: Some("ObjectAlreadyExists"), message: "already exists: sled \"g2 / i86pc (90413e40-8139-43b4-9081-365dab6e5579)\"", request_id: "9eb95a9f-3fe0-4f75-8846-13490b95500e" }
```

This does change the external API slightly (204 No Content -> 201 Created, and we now return the ID), but I think (?) that's probably fine since we have no real consumers of that yet.
1 parent d901636 commit 472d9f7

File tree

14 files changed

+286
-65
lines changed

14 files changed

+286
-65
lines changed

clients/nexus-client/src/lib.rs

+1
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ progenitor::generate_api!(
3737
NetworkInterfaceKind = omicron_common::api::internal::shared::NetworkInterfaceKind,
3838
TypedUuidForCollectionKind = omicron_uuid_kinds::CollectionUuid,
3939
TypedUuidForDownstairsKind = omicron_uuid_kinds::TypedUuid<omicron_uuid_kinds::DownstairsKind>,
40+
TypedUuidForSledKind = omicron_uuid_kinds::TypedUuid<omicron_uuid_kinds::SledKind>,
4041
TypedUuidForUpstairsKind = omicron_uuid_kinds::TypedUuid<omicron_uuid_kinds::UpstairsKind>,
4142
TypedUuidForUpstairsRepairKind = omicron_uuid_kinds::TypedUuid<omicron_uuid_kinds::UpstairsRepairKind>,
4243
TypedUuidForUpstairsSessionKind = omicron_uuid_kinds::TypedUuid<omicron_uuid_kinds::UpstairsSessionKind>,

dev-tools/omdb/src/bin/omdb/nexus.rs

+5-3
Original file line numberDiff line numberDiff line change
@@ -1231,14 +1231,16 @@ async fn cmd_nexus_sled_add(
12311231
args: &SledAddArgs,
12321232
_destruction_token: DestructiveOperationToken,
12331233
) -> Result<(), anyhow::Error> {
1234-
client
1234+
let sled_id = client
12351235
.sled_add(&UninitializedSledId {
12361236
part: args.part.clone(),
12371237
serial: args.serial.clone(),
12381238
})
12391239
.await
1240-
.context("adding sled")?;
1241-
eprintln!("added sled {} ({})", args.serial, args.part);
1240+
.context("adding sled")?
1241+
.into_inner()
1242+
.id;
1243+
eprintln!("added sled {} ({}): {sled_id}", args.serial, args.part);
12421244
Ok(())
12431245
}
12441246

nexus/db-model/src/sled_underlay_subnet_allocation.rs

+3-1
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,16 @@
33
// file, You can obtain one at https://mozilla.org/MPL/2.0/.
44

55
use crate::schema::sled_underlay_subnet_allocation;
6+
use crate::typed_uuid::DbTypedUuid;
7+
use omicron_uuid_kinds::SledKind;
68
use uuid::Uuid;
79

810
/// Underlay allocation for a sled added to an initialized rack
911
#[derive(Queryable, Insertable, Debug, Clone, Selectable)]
1012
#[diesel(table_name = sled_underlay_subnet_allocation)]
1113
pub struct SledUnderlaySubnetAllocation {
1214
pub rack_id: Uuid,
13-
pub sled_id: Uuid,
15+
pub sled_id: DbTypedUuid<SledKind>,
1416
pub subnet_octet: i16,
1517
pub hw_baseboard_id: Uuid,
1618
}

nexus/db-queries/src/db/datastore/mod.rs

+1
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,7 @@ pub use inventory::DataStoreInventoryTest;
107107
use nexus_db_model::AllSchemaVersions;
108108
pub use probe::ProbeInfo;
109109
pub use rack::RackInit;
110+
pub use rack::SledUnderlayAllocationResult;
110111
pub use silo::Discoverability;
111112
pub use switch_port::SwitchPortSettingsCombinedResult;
112113
pub use virtual_provisioning_collection::StorageType;

nexus/db-queries/src/db/datastore/rack.rs

+57-17
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ use omicron_common::api::external::ResourceType;
6464
use omicron_common::api::external::UpdateResult;
6565
use omicron_common::bail_unless;
6666
use omicron_uuid_kinds::GenericUuid;
67+
use omicron_uuid_kinds::SledUuid;
6768
use slog_error_chain::InlineErrorChain;
6869
use std::sync::{Arc, OnceLock};
6970
use uuid::Uuid;
@@ -172,6 +173,15 @@ impl From<RackInitError> for Error {
172173
}
173174
}
174175

176+
/// Possible results of attempting a new sled underlay allocation
177+
#[derive(Debug, Clone)]
178+
pub enum SledUnderlayAllocationResult {
179+
/// A new allocation was created
180+
New(SledUnderlaySubnetAllocation),
181+
/// A prior allocation was found
182+
Existing(SledUnderlaySubnetAllocation),
183+
}
184+
175185
impl DataStore {
176186
pub async fn rack_list(
177187
&self,
@@ -295,7 +305,7 @@ impl DataStore {
295305
opctx: &OpContext,
296306
rack_id: Uuid,
297307
hw_baseboard_id: Uuid,
298-
) -> Result<SledUnderlaySubnetAllocation, Error> {
308+
) -> Result<SledUnderlayAllocationResult, Error> {
299309
// Fetch all the existing allocations via self.rack_id
300310
let allocations = self.rack_subnet_allocations(opctx, rack_id).await?;
301311

@@ -306,17 +316,14 @@ impl DataStore {
306316
const MIN_SUBNET_OCTET: i16 = 33;
307317
let mut new_allocation = SledUnderlaySubnetAllocation {
308318
rack_id,
309-
sled_id: Uuid::new_v4(),
319+
sled_id: SledUuid::new_v4().into(),
310320
subnet_octet: MIN_SUBNET_OCTET,
311321
hw_baseboard_id,
312322
};
313-
let mut allocation_already_exists = false;
314323
for allocation in allocations {
315324
if allocation.hw_baseboard_id == new_allocation.hw_baseboard_id {
316325
// We already have an allocation for this sled.
317-
new_allocation = allocation;
318-
allocation_already_exists = true;
319-
break;
326+
return Ok(SledUnderlayAllocationResult::Existing(allocation));
320327
}
321328
if allocation.subnet_octet == new_allocation.subnet_octet {
322329
bail_unless!(
@@ -332,11 +339,8 @@ impl DataStore {
332339
// allocations when sleds are being added. We will need another
333340
// mechanism ala generation numbers when we must interleave additions
334341
// and removals of sleds.
335-
if !allocation_already_exists {
336-
self.sled_subnet_allocation_insert(opctx, &new_allocation).await?;
337-
}
338-
339-
Ok(new_allocation)
342+
self.sled_subnet_allocation_insert(opctx, &new_allocation).await?;
343+
Ok(SledUnderlayAllocationResult::New(new_allocation))
340344
}
341345

342346
/// Return all current underlay allocations for the rack.
@@ -2121,7 +2125,7 @@ mod test {
21212125
for i in 0..5i16 {
21222126
let allocation = SledUnderlaySubnetAllocation {
21232127
rack_id,
2124-
sled_id: Uuid::new_v4(),
2128+
sled_id: SledUuid::new_v4().into(),
21252129
subnet_octet: 33 + i,
21262130
hw_baseboard_id: Uuid::new_v4(),
21272131
};
@@ -2141,7 +2145,7 @@ mod test {
21412145
// sled_id. Ensure we get an error due to a unique constraint.
21422146
let mut should_fail_allocation = SledUnderlaySubnetAllocation {
21432147
rack_id,
2144-
sled_id: Uuid::new_v4(),
2148+
sled_id: SledUuid::new_v4().into(),
21452149
subnet_octet: 37,
21462150
hw_baseboard_id: Uuid::new_v4(),
21472151
};
@@ -2169,7 +2173,7 @@ mod test {
21692173
// Allocations outside our expected range fail
21702174
let mut should_fail_allocation = SledUnderlaySubnetAllocation {
21712175
rack_id,
2172-
sled_id: Uuid::new_v4(),
2176+
sled_id: SledUuid::new_v4().into(),
21732177
subnet_octet: 32,
21742178
hw_baseboard_id: Uuid::new_v4(),
21752179
};
@@ -2205,18 +2209,28 @@ mod test {
22052209

22062210
let rack_id = Uuid::new_v4();
22072211

2212+
let mut hw_baseboard_ids = vec![];
22082213
let mut allocated_octets = vec![];
22092214
for _ in 0..5 {
2215+
let hw_baseboard_id = Uuid::new_v4();
2216+
hw_baseboard_ids.push(hw_baseboard_id);
22102217
allocated_octets.push(
2211-
datastore
2218+
match datastore
22122219
.allocate_sled_underlay_subnet_octets(
22132220
&opctx,
22142221
rack_id,
2215-
Uuid::new_v4(),
2222+
hw_baseboard_id,
22162223
)
22172224
.await
22182225
.unwrap()
2219-
.subnet_octet,
2226+
{
2227+
SledUnderlayAllocationResult::New(allocation) => {
2228+
allocation.subnet_octet
2229+
}
2230+
SledUnderlayAllocationResult::Existing(allocation) => {
2231+
panic!("unexpected allocation {allocation:?}");
2232+
}
2233+
},
22202234
);
22212235
}
22222236

@@ -2232,6 +2246,32 @@ mod test {
22322246
allocations.iter().map(|a| a.subnet_octet).collect::<Vec<_>>()
22332247
);
22342248

2249+
// If we attempt to insert the same baseboards again, we should get the
2250+
// existing allocations back.
2251+
for (hw_baseboard_id, expected_octet) in
2252+
hw_baseboard_ids.into_iter().zip(expected)
2253+
{
2254+
match datastore
2255+
.allocate_sled_underlay_subnet_octets(
2256+
&opctx,
2257+
rack_id,
2258+
hw_baseboard_id,
2259+
)
2260+
.await
2261+
.unwrap()
2262+
{
2263+
SledUnderlayAllocationResult::New(allocation) => {
2264+
panic!("unexpected allocation {allocation:?}");
2265+
}
2266+
SledUnderlayAllocationResult::Existing(allocation) => {
2267+
assert_eq!(
2268+
allocation.subnet_octet, expected_octet,
2269+
"unexpected octet for {allocation:?}"
2270+
);
2271+
}
2272+
}
2273+
}
2274+
22352275
db.cleanup().await.unwrap();
22362276
logctx.cleanup_successful();
22372277
}

nexus/src/app/rack.rs

+21-5
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ use nexus_db_queries::context::OpContext;
1818
use nexus_db_queries::db;
1919
use nexus_db_queries::db::datastore::DnsVersionUpdateBuilder;
2020
use nexus_db_queries::db::datastore::RackInit;
21+
use nexus_db_queries::db::datastore::SledUnderlayAllocationResult;
2122
use nexus_db_queries::db::lookup::LookupPath;
2223
use nexus_reconfigurator_execution::silo_dns_name;
2324
use nexus_types::deployment::blueprint_zone_type;
@@ -56,7 +57,10 @@ use omicron_common::api::external::ListResultVec;
5657
use omicron_common::api::external::LookupResult;
5758
use omicron_common::api::external::Name;
5859
use omicron_common::api::external::NameOrId;
60+
use omicron_common::api::external::ResourceType;
5961
use omicron_common::api::internal::shared::ExternalPortDiscovery;
62+
use omicron_uuid_kinds::GenericUuid;
63+
use omicron_uuid_kinds::SledUuid;
6064
use sled_agent_client::types::AddSledRequest;
6165
use sled_agent_client::types::StartSledAgentRequest;
6266
use sled_agent_client::types::StartSledAgentRequestBody;
@@ -776,7 +780,7 @@ impl super::Nexus {
776780
&self,
777781
opctx: &OpContext,
778782
sled: UninitializedSledId,
779-
) -> Result<(), Error> {
783+
) -> Result<SledUuid, Error> {
780784
let baseboard_id = sled.clone().into();
781785
let hw_baseboard_id = self
782786
.db_datastore
@@ -787,14 +791,26 @@ impl super::Nexus {
787791
let rack_subnet =
788792
Ipv6Subnet::<RACK_PREFIX>::from(rack_subnet(Some(subnet))?);
789793

790-
let allocation = self
794+
let allocation = match self
791795
.db_datastore
792796
.allocate_sled_underlay_subnet_octets(
793797
opctx,
794798
self.rack_id,
795799
hw_baseboard_id,
796800
)
797-
.await?;
801+
.await?
802+
{
803+
SledUnderlayAllocationResult::New(allocation) => allocation,
804+
SledUnderlayAllocationResult::Existing(allocation) => {
805+
return Err(Error::ObjectAlreadyExists {
806+
type_name: ResourceType::Sled,
807+
object_name: format!(
808+
"{} / {} ({})",
809+
sled.serial, sled.part, allocation.sled_id
810+
),
811+
});
812+
}
813+
};
798814

799815
// Convert `UninitializedSledId` to the sled-agent type
800816
let baseboard_id = sled_agent_client::types::BaseboardId {
@@ -809,7 +825,7 @@ impl super::Nexus {
809825
generation: 0,
810826
schema_version: 1,
811827
body: StartSledAgentRequestBody {
812-
id: allocation.sled_id,
828+
id: allocation.sled_id.into_untyped_uuid(),
813829
rack_id: allocation.rack_id,
814830
use_trust_quorum: true,
815831
is_lrtq_learner: true,
@@ -852,7 +868,7 @@ impl super::Nexus {
852868
),
853869
})?;
854870

855-
Ok(())
871+
Ok(allocation.sled_id.into())
856872
}
857873

858874
async fn get_any_sled_agent_url(

nexus/src/external_api/http_entrypoints.rs

+11-4
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@ use omicron_common::api::external::{
8383
http_pagination::data_page_params_for, AggregateBgpMessageHistory,
8484
};
8585
use omicron_common::bail_unless;
86+
use omicron_uuid_kinds::SledUuid;
8687
use parse_display::Display;
8788
use propolis_client::support::tungstenite::protocol::frame::coding::CloseCode;
8889
use propolis_client::support::tungstenite::protocol::{
@@ -5210,6 +5211,12 @@ async fn sled_list_uninitialized(
52105211
apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await
52115212
}
52125213

5214+
/// The unique ID of a sled.
5215+
#[derive(Clone, Debug, Serialize, JsonSchema)]
5216+
pub struct SledId {
5217+
pub id: SledUuid,
5218+
}
5219+
52135220
/// Add sled to initialized rack
52145221
//
52155222
// TODO: In the future this should really be a PUT request, once we resolve
@@ -5218,19 +5225,19 @@ async fn sled_list_uninitialized(
52185225
// we are only operating on single rack systems.
52195226
#[endpoint {
52205227
method = POST,
5221-
path = "/v1/system/hardware/sleds/",
5228+
path = "/v1/system/hardware/sleds",
52225229
tags = ["system/hardware"]
52235230
}]
52245231
async fn sled_add(
52255232
rqctx: RequestContext<Arc<ServerContext>>,
52265233
sled: TypedBody<params::UninitializedSledId>,
5227-
) -> Result<HttpResponseUpdatedNoContent, HttpError> {
5234+
) -> Result<HttpResponseCreated<SledId>, HttpError> {
52285235
let apictx = rqctx.context();
52295236
let nexus = &apictx.nexus;
52305237
let handler = async {
52315238
let opctx = crate::context::op_context_for_external_api(&rqctx).await?;
5232-
nexus.sled_add(&opctx, sled.into_inner()).await?;
5233-
Ok(HttpResponseUpdatedNoContent())
5239+
let id = nexus.sled_add(&opctx, sled.into_inner()).await?;
5240+
Ok(HttpResponseCreated(SledId { id }))
52345241
};
52355242
apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await
52365243
}

nexus/src/internal_api/http_entrypoints.rs

+5-5
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,9 @@
44

55
//! Handler functions (entrypoints) for HTTP APIs internal to the control plane
66
7-
use crate::ServerContext;
8-
97
use super::params::{OximeterInfo, RackInitializationRequest};
8+
use crate::external_api::http_entrypoints::SledId;
9+
use crate::ServerContext;
1010
use dropshot::endpoint;
1111
use dropshot::ApiDescription;
1212
use dropshot::FreeformBody;
@@ -1043,13 +1043,13 @@ async fn sled_list_uninitialized(
10431043
async fn sled_add(
10441044
rqctx: RequestContext<Arc<ServerContext>>,
10451045
sled: TypedBody<UninitializedSledId>,
1046-
) -> Result<HttpResponseUpdatedNoContent, HttpError> {
1046+
) -> Result<HttpResponseCreated<SledId>, HttpError> {
10471047
let apictx = rqctx.context();
10481048
let nexus = &apictx.nexus;
10491049
let handler = async {
10501050
let opctx = crate::context::op_context_for_internal_api(&rqctx).await;
1051-
nexus.sled_add(&opctx, sled.into_inner()).await?;
1052-
Ok(HttpResponseUpdatedNoContent())
1051+
let id = nexus.sled_add(&opctx, sled.into_inner()).await?;
1052+
Ok(HttpResponseCreated(SledId { id }))
10531053
};
10541054
apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await
10551055
}

nexus/test-utils/src/lib.rs

+23
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,9 @@ use omicron_uuid_kinds::ZpoolUuid;
6161
use oximeter_collector::Oximeter;
6262
use oximeter_producer::LogConfig;
6363
use oximeter_producer::Server as ProducerServer;
64+
use sled_agent_client::types::EarlyNetworkConfig;
65+
use sled_agent_client::types::EarlyNetworkConfigBody;
66+
use sled_agent_client::types::RackNetworkConfigV1;
6467
use slog::{debug, error, o, Logger};
6568
use std::collections::BTreeMap;
6669
use std::collections::HashMap;
@@ -911,6 +914,26 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> {
911914
})
912915
.await
913916
.expect("Failed to configure sled agent with our zones");
917+
client
918+
.write_network_bootstore_config(&EarlyNetworkConfig {
919+
body: EarlyNetworkConfigBody {
920+
ntp_servers: Vec::new(),
921+
rack_network_config: Some(RackNetworkConfigV1 {
922+
bfd: Vec::new(),
923+
bgp: Vec::new(),
924+
infra_ip_first: "192.0.2.10".parse().unwrap(),
925+
infra_ip_last: "192.0.2.100".parse().unwrap(),
926+
ports: Vec::new(),
927+
rack_subnet: "fd00:1122:3344:0100::/56"
928+
.parse()
929+
.unwrap(),
930+
}),
931+
},
932+
generation: 1,
933+
schema_version: 1,
934+
})
935+
.await
936+
.expect("Failed to write early networking config to bootstore");
914937
}
915938

916939
// Set up the Crucible Pantry on an existing Sled Agent.

0 commit comments

Comments
 (0)