Skip to content

Commit 50ebce8

Browse files
committed
Merge 'Purge old ip on change' from Petr Gusev
When a node changes IP address we need to remove its old IP from `system.peers` and gossiper. We do this in `sync_raft_topology_nodes` when the new IP is saved into `system.peers` to avoid losing the mapping if the node crashes between deleting and saving the new IP. We also handle the possible duplicates in this case by dropping them on the read path when the node is restarted. The PR also fixes the problem with old IPs getting resurrected when a node changes its IP address. The following scenario is possible: a node `A` changes its IP from `ip1` to `ip2` with restart, other nodes are not yet aware of `ip2` so they keep gossiping `ip1`. After restart `A` receives `ip1` in a gossip message and calls `handle_major_state_change` since it considers it as a new node. Then `on_join` event is called on the gossiper notification handlers, we receive such event in `raft_ip_address_updater` and reverts the IP of the node A back to ip1. To fix this we ensure that the new gossiper generation number is used when a node registers its IP address in `raft_address_map` at startup. The `test_change_ip` is adjusted to ensure that the old IPs are properly removed in all cases, even if the node crashes. Fixes scylladb#16886 Fixes scylladb#16691 Fixes scylladb#17199 Closes scylladb#17162 * github.com:scylladb/scylladb: test_change_ip: improve the test raft_ip_address_updater: remove stale IPs from gossiper raft_address_map: add my ip with the new generation system_keyspace::update_peer_info: check ep and host_id are not empty system_keyspace::update_peer_info: make host_id an explicit parameter system_keyspace::update_peer_info: remove any_set flag optimisation system_keyspace: remove duplicate ips for host_id system_keyspace: peers table: use coroutines storage_service::raft_ip_address_updater: log gossiper event name raft topology: ip change: purge old IP on_endpoint_change: coroutinize the lambda around sync_raft_topology_nodes
2 parents 22a5112 + c414067 commit 50ebce8

9 files changed

+341
-201
lines changed

db/system_keyspace.cc

+113-90
Original file line numberDiff line numberDiff line change
@@ -1464,20 +1464,58 @@ struct local_cache {
14641464
system_keyspace::bootstrap_state _state;
14651465
};
14661466

1467+
future<> system_keyspace::peers_table_read_fixup() {
1468+
assert(this_shard_id() == 0);
1469+
if (_peers_table_read_fixup_done) {
1470+
co_return;
1471+
}
1472+
_peers_table_read_fixup_done = true;
1473+
1474+
const auto cql = format("SELECT peer, host_id, WRITETIME(host_id) as ts from system.{}", PEERS);
1475+
std::unordered_map<utils::UUID, std::pair<net::inet_address, int64_t>> map{};
1476+
const auto cql_result = co_await execute_cql(cql);
1477+
for (const auto& row : *cql_result) {
1478+
const auto peer = row.get_as<net::inet_address>("peer");
1479+
if (!row.has("host_id")) {
1480+
slogger.error("Peer {} has no host_id in system.{}, the record is broken, removing it",
1481+
peer, system_keyspace::PEERS);
1482+
co_await remove_endpoint(gms::inet_address{peer});
1483+
continue;
1484+
}
1485+
const auto host_id = row.get_as<utils::UUID>("host_id");
1486+
const auto ts = row.get_as<int64_t>("ts");
1487+
const auto it = map.find(host_id);
1488+
if (it == map.end()) {
1489+
map.insert({host_id, {peer, ts}});
1490+
continue;
1491+
}
1492+
if (it->second.second >= ts) {
1493+
slogger.error("Peer {} with host_id {} has newer IP {} in system.{}, the record is stale, removing it",
1494+
peer, host_id, it->second.first, system_keyspace::PEERS);
1495+
co_await remove_endpoint(gms::inet_address{peer});
1496+
} else {
1497+
slogger.error("Peer {} with host_id {} has newer IP {} in system.{}, the record is stale, removing it",
1498+
it->second.first, host_id, peer, system_keyspace::PEERS);
1499+
co_await remove_endpoint(gms::inet_address{it->second.first});
1500+
it->second = {peer, ts};
1501+
}
1502+
}
1503+
}
1504+
14671505
future<std::unordered_map<gms::inet_address, locator::endpoint_dc_rack>> system_keyspace::load_dc_rack_info() {
1468-
auto msg = co_await execute_cql(format("SELECT peer, data_center, rack from system.{}", PEERS));
1506+
co_await peers_table_read_fixup();
1507+
1508+
const auto msg = co_await execute_cql(format("SELECT peer, data_center, rack from system.{}", PEERS));
14691509

14701510
std::unordered_map<gms::inet_address, locator::endpoint_dc_rack> ret;
14711511
for (const auto& row : *msg) {
1472-
net::inet_address peer = row.template get_as<net::inet_address>("peer");
14731512
if (!row.has("data_center") || !row.has("rack")) {
14741513
continue;
14751514
}
1476-
gms::inet_address gms_addr(std::move(peer));
1477-
sstring dc = row.template get_as<sstring>("data_center");
1478-
sstring rack = row.template get_as<sstring>("rack");
1479-
1480-
ret.emplace(gms_addr, locator::endpoint_dc_rack{ dc, rack });
1515+
ret.emplace(row.get_as<net::inet_address>("peer"), locator::endpoint_dc_rack {
1516+
row.get_as<sstring>("data_center"),
1517+
row.get_as<sstring>("rack")
1518+
});
14811519
}
14821520

14831521
co_return ret;
@@ -1735,114 +1773,98 @@ static std::vector<cdc::generation_id_v2> decode_cdc_generations_ids(const set_t
17351773
return gen_ids_list;
17361774
}
17371775

1738-
static locator::host_id get_host_id(const gms::inet_address& peer, const cql3::untyped_result_set::row& row) {
1739-
locator::host_id host_id;
1740-
if (row.has("host_id")) {
1741-
host_id = locator::host_id(row.get_as<utils::UUID>("host_id"));
1742-
}
1743-
if (!host_id) {
1744-
slogger.warn("Peer {} has no host_id in system.{}", peer, system_keyspace::PEERS);
1745-
}
1746-
return host_id;
1747-
}
1748-
17491776
future<std::unordered_map<gms::inet_address, std::unordered_set<dht::token>>> system_keyspace::load_tokens() {
1750-
sstring req = format("SELECT peer, host_id, tokens FROM system.{}", PEERS);
1751-
return execute_cql(req).then([] (::shared_ptr<cql3::untyped_result_set> cql_result) {
1752-
std::unordered_map<gms::inet_address, std::unordered_set<dht::token>> ret;
1753-
for (auto& row : *cql_result) {
1754-
auto peer = gms::inet_address(row.get_as<net::inet_address>("peer"));
1755-
if (!get_host_id(peer, row)) {
1756-
continue;
1757-
}
1758-
if (row.has("tokens")) {
1759-
ret.emplace(peer, decode_tokens(deserialize_set_column(*peers(), row, "tokens")));
1760-
}
1777+
co_await peers_table_read_fixup();
1778+
1779+
const sstring req = format("SELECT peer, tokens FROM system.{}", PEERS);
1780+
std::unordered_map<gms::inet_address, std::unordered_set<dht::token>> ret;
1781+
const auto cql_result = co_await execute_cql(req);
1782+
for (const auto& row : *cql_result) {
1783+
if (row.has("tokens")) {
1784+
ret.emplace(gms::inet_address(row.get_as<net::inet_address>("peer")),
1785+
decode_tokens(deserialize_set_column(*peers(), row, "tokens")));
17611786
}
1762-
return ret;
1763-
});
1787+
}
1788+
co_return ret;
17641789
}
17651790

17661791
future<std::unordered_map<gms::inet_address, locator::host_id>> system_keyspace::load_host_ids() {
1767-
sstring req = format("SELECT peer, host_id FROM system.{}", PEERS);
1768-
return execute_cql(req).then([] (::shared_ptr<cql3::untyped_result_set> cql_result) {
1769-
std::unordered_map<gms::inet_address, locator::host_id> ret;
1770-
for (auto& row : *cql_result) {
1771-
auto peer = gms::inet_address(row.get_as<net::inet_address>("peer"));
1772-
if (auto host_id = get_host_id(peer, row)) {
1773-
ret.emplace(peer, host_id);
1774-
}
1775-
}
1776-
return ret;
1777-
});
1792+
co_await peers_table_read_fixup();
1793+
1794+
const sstring req = format("SELECT peer, host_id FROM system.{}", PEERS);
1795+
std::unordered_map<gms::inet_address, locator::host_id> ret;
1796+
const auto cql_result = co_await execute_cql(req);
1797+
for (const auto& row : *cql_result) {
1798+
ret.emplace(gms::inet_address(row.get_as<net::inet_address>("peer")),
1799+
locator::host_id(row.get_as<utils::UUID>("host_id")));
1800+
}
1801+
co_return ret;
17781802
}
17791803

17801804
future<std::vector<gms::inet_address>> system_keyspace::load_peers() {
1781-
auto res = co_await execute_cql(format("SELECT peer, host_id, tokens FROM system.{}", PEERS));
1805+
co_await peers_table_read_fixup();
1806+
1807+
const auto res = co_await execute_cql(format("SELECT peer, tokens FROM system.{}", PEERS));
17821808
assert(res);
17831809

17841810
std::vector<gms::inet_address> ret;
1785-
for (auto& row: *res) {
1786-
auto peer = gms::inet_address(row.get_as<net::inet_address>("peer"));
1787-
if (!get_host_id(peer, row)) {
1788-
continue;
1789-
}
1811+
for (const auto& row: *res) {
17901812
if (!row.has("tokens")) {
17911813
// Ignore rows that don't have tokens. Such rows may
17921814
// be introduced by code that persists parts of peer
17931815
// information (such as RAFT_ID) which may potentially
17941816
// race with deleting a peer (during node removal).
17951817
continue;
17961818
}
1797-
ret.emplace_back(peer);
1819+
ret.emplace_back(gms::inet_address(row.get_as<net::inet_address>("peer")));
17981820
}
17991821
co_return ret;
18001822
}
18011823

18021824
future<std::unordered_map<gms::inet_address, sstring>> system_keyspace::load_peer_features() {
1803-
sstring req = format("SELECT peer, supported_features FROM system.{}", PEERS);
1804-
return execute_cql(req).then([] (::shared_ptr<cql3::untyped_result_set> cql_result) {
1805-
std::unordered_map<gms::inet_address, sstring> ret;
1806-
for (auto& row : *cql_result) {
1807-
if (row.has("supported_features")) {
1808-
ret.emplace(row.get_as<net::inet_address>("peer"),
1809-
row.get_as<sstring>("supported_features"));
1810-
}
1825+
co_await peers_table_read_fixup();
1826+
1827+
const sstring req = format("SELECT peer, supported_features FROM system.{}", PEERS);
1828+
std::unordered_map<gms::inet_address, sstring> ret;
1829+
const auto cql_result = co_await execute_cql(req);
1830+
for (const auto& row : *cql_result) {
1831+
if (row.has("supported_features")) {
1832+
ret.emplace(row.get_as<net::inet_address>("peer"),
1833+
row.get_as<sstring>("supported_features"));
18111834
}
1812-
return ret;
1813-
});
1835+
}
1836+
co_return ret;
18141837
}
18151838

18161839
future<std::unordered_map<gms::inet_address, gms::inet_address>> system_keyspace::get_preferred_ips() {
1817-
sstring req = format("SELECT peer, preferred_ip FROM system.{}", PEERS);
1818-
return execute_cql(req).then([] (::shared_ptr<cql3::untyped_result_set> cql_res_set) {
1819-
std::unordered_map<gms::inet_address, gms::inet_address> res;
1820-
1821-
for (auto& r : *cql_res_set) {
1822-
if (r.has("preferred_ip")) {
1823-
res.emplace(gms::inet_address(r.get_as<net::inet_address>("peer")),
1824-
gms::inet_address(r.get_as<net::inet_address>("preferred_ip")));
1825-
}
1840+
co_await peers_table_read_fixup();
1841+
1842+
const sstring req = format("SELECT peer, preferred_ip FROM system.{}", PEERS);
1843+
std::unordered_map<gms::inet_address, gms::inet_address> res;
1844+
1845+
const auto cql_result = co_await execute_cql(req);
1846+
for (const auto& r : *cql_result) {
1847+
if (r.has("preferred_ip")) {
1848+
res.emplace(gms::inet_address(r.get_as<net::inet_address>("peer")),
1849+
gms::inet_address(r.get_as<net::inet_address>("preferred_ip")));
18261850
}
1851+
}
18271852

1828-
return res;
1829-
});
1853+
co_return res;
18301854
}
18311855

18321856
namespace {
18331857
template <typename T>
1834-
static data_value_or_unset make_data_value_or_unset(const std::optional<T>& opt, bool& any_set) {
1858+
static data_value_or_unset make_data_value_or_unset(const std::optional<T>& opt) {
18351859
if (opt) {
1836-
any_set = true;
18371860
return data_value(*opt);
18381861
} else {
18391862
return unset_value{};
18401863
}
18411864
};
18421865

1843-
static data_value_or_unset make_data_value_or_unset(const std::optional<std::unordered_set<dht::token>>& opt, bool& any_set) {
1866+
static data_value_or_unset make_data_value_or_unset(const std::optional<std::unordered_set<dht::token>>& opt) {
18441867
if (opt) {
1845-
any_set = true;
18461868
auto set_type = set_type_impl::get_instance(utf8_type, true);
18471869
return make_set_value(set_type, prepare_tokens(*opt));
18481870
} else {
@@ -1851,29 +1873,30 @@ static data_value_or_unset make_data_value_or_unset(const std::optional<std::uno
18511873
};
18521874
}
18531875

1854-
future<> system_keyspace::update_peer_info(gms::inet_address ep, const peer_info& info) {
1876+
future<> system_keyspace::update_peer_info(gms::inet_address ep, locator::host_id hid, const peer_info& info) {
1877+
if (ep == gms::inet_address{}) {
1878+
on_internal_error(slogger, format("update_peer_info called with empty inet_address, host_id {}", hid));
1879+
}
1880+
if (!hid) {
1881+
on_internal_error(slogger, format("update_peer_info called with empty host_id, ep {}", ep));
1882+
}
18551883
if (_db.get_token_metadata().get_topology().is_me(ep)) {
18561884
on_internal_error(slogger, format("update_peer_info called for this node: {}", ep));
18571885
}
18581886

1859-
bool any_set = false;
18601887
data_value_list values = {
18611888
data_value_or_unset(data_value(ep.addr())),
1862-
make_data_value_or_unset(info.data_center, any_set),
1863-
make_data_value_or_unset(info.host_id, any_set),
1864-
make_data_value_or_unset(info.preferred_ip, any_set),
1865-
make_data_value_or_unset(info.rack, any_set),
1866-
make_data_value_or_unset(info.release_version, any_set),
1867-
make_data_value_or_unset(info.rpc_address, any_set),
1868-
make_data_value_or_unset(info.schema_version, any_set),
1869-
make_data_value_or_unset(info.tokens, any_set),
1870-
make_data_value_or_unset(info.supported_features, any_set),
1889+
make_data_value_or_unset(info.data_center),
1890+
data_value_or_unset(hid.id),
1891+
make_data_value_or_unset(info.preferred_ip),
1892+
make_data_value_or_unset(info.rack),
1893+
make_data_value_or_unset(info.release_version),
1894+
make_data_value_or_unset(info.rpc_address),
1895+
make_data_value_or_unset(info.schema_version),
1896+
make_data_value_or_unset(info.tokens),
1897+
make_data_value_or_unset(info.supported_features),
18711898
};
18721899

1873-
if (!any_set) {
1874-
co_return;
1875-
}
1876-
18771900
auto query = fmt::format("INSERT INTO system.{} "
18781901
"(peer,data_center,host_id,preferred_ip,rack,release_version,rpc_address,schema_version,tokens,supported_features) VALUES"
18791902
"(?,?,?,?,?,?,?,?,?,?)", PEERS);
@@ -1928,7 +1951,7 @@ future<> system_keyspace::update_schema_version(table_schema_version version) {
19281951
* Remove stored tokens being used by another node
19291952
*/
19301953
future<> system_keyspace::remove_endpoint(gms::inet_address ep) {
1931-
sstring req = format("DELETE FROM system.{} WHERE peer = ?", PEERS);
1954+
const sstring req = format("DELETE FROM system.{} WHERE peer = ?", PEERS);
19321955
slogger.debug("DELETE FROM system.{} WHERE peer = {}", PEERS, ep);
19331956
co_await execute_cql(req, ep.addr()).discard_result();
19341957
}

db/system_keyspace.hh

+11-2
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,7 @@ class system_keyspace : public seastar::peering_sharded_service<system_keyspace>
114114
replica::database& _db;
115115
std::unique_ptr<local_cache> _cache;
116116
virtual_tables_registry _virtual_tables_registry;
117+
bool _peers_table_read_fixup_done = false;
117118

118119
static schema_ptr raft_snapshot_config();
119120
static schema_ptr local();
@@ -128,6 +129,15 @@ class system_keyspace : public seastar::peering_sharded_service<system_keyspace>
128129
static schema_ptr large_cells();
129130
static schema_ptr scylla_local();
130131
future<> force_blocking_flush(sstring cfname);
132+
// This function is called when the system.peers table is read,
133+
// and it fixes some types of inconsistencies that can occur
134+
// due to node crashes:
135+
// * missing host_id. This is possible in the old versions of the code. Such records
136+
// are removed and the warning is written to the log.
137+
// * duplicate IPs for a given host_id. This is possible when some node changes its IP
138+
// and this node crashes after adding a new IP but before removing the old one. The
139+
// record with older timestamp is removed, the warning is written to the log.
140+
future<> peers_table_read_fixup();
131141
public:
132142
static schema_ptr size_estimates();
133143
public:
@@ -265,7 +275,6 @@ public:
265275
public:
266276
struct peer_info {
267277
std::optional<sstring> data_center;
268-
std::optional<utils::UUID> host_id;
269278
std::optional<net::inet_address> preferred_ip;
270279
std::optional<sstring> rack;
271280
std::optional<sstring> release_version;
@@ -275,7 +284,7 @@ public:
275284
std::optional<sstring> supported_features;
276285
};
277286

278-
future<> update_peer_info(gms::inet_address ep, const peer_info& info);
287+
future<> update_peer_info(gms::inet_address ep, locator::host_id hid, const peer_info& info);
279288

280289
future<> remove_endpoint(gms::inet_address ep);
281290

main.cc

+4-2
Original file line numberDiff line numberDiff line change
@@ -1792,8 +1792,10 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
17921792
// Set up group0 service earlier since it is needed by group0 setup just below
17931793
ss.local().set_group0(group0_service, raft_topology_change_enabled);
17941794

1795+
const auto generation_number = gms::generation_type(sys_ks.local().increment_and_get_generation().get());
1796+
17951797
// Load address_map from system.peers and subscribe to gossiper events to keep it updated.
1796-
ss.local().init_address_map(raft_address_map.local()).get();
1798+
ss.local().init_address_map(raft_address_map.local(), generation_number).get();
17971799
auto cancel_address_map_subscription = defer_verbose_shutdown("storage service uninit address map", [&ss] {
17981800
ss.local().uninit_address_map().get();
17991801
});
@@ -1814,7 +1816,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
18141816
}).get();
18151817

18161818
with_scheduling_group(maintenance_scheduling_group, [&] {
1817-
return ss.local().join_cluster(sys_dist_ks, proxy, gossiper, service::start_hint_manager::yes);
1819+
return ss.local().join_cluster(sys_dist_ks, proxy, gossiper, service::start_hint_manager::yes, generation_number);
18181820
}).get();
18191821

18201822
sl_controller.invoke_on_all([&lifecycle_notifier] (qos::service_level_controller& controller) {

service/raft/raft_group0.cc

-25
Original file line numberDiff line numberDiff line change
@@ -387,12 +387,6 @@ future<> raft_group0::start_server_for_group0(raft::group_id group0_id, service:
387387
// The address map may miss our own id in case we connect
388388
// to an existing Raft Group 0 leader.
389389
auto my_id = load_my_id();
390-
_raft_gr.address_map().add_or_update_entry(my_id, _gossiper.get_broadcast_address());
391-
// At this time the group registry is already up and running,
392-
// so the address map is getting all the notifications from
393-
// the gossiper. By reading the application state *after* subscribing to new gossip events,
394-
// we ensure we haven't missed any IP update in the map.
395-
load_initial_raft_address_map();
396390
group0_log.info("Server {} is starting group 0 with id {}", my_id, group0_id);
397391
auto srv_for_group0 = create_server_for_group0(group0_id, my_id, ss, qp, mm, topology_change_enabled);
398392
auto& persistence = srv_for_group0.persistence;
@@ -749,25 +743,6 @@ future<> raft_group0::setup_group0(
749743
co_await _client.set_group0_upgrade_state(group0_upgrade_state::use_post_raft_procedures);
750744
}
751745

752-
void raft_group0::load_initial_raft_address_map() {
753-
_gossiper.for_each_endpoint_state([this] (const gms::inet_address& ip_addr, const gms::endpoint_state& state) {
754-
auto* value = state.get_application_state_ptr(gms::application_state::HOST_ID);
755-
if (value == nullptr) {
756-
return;
757-
}
758-
auto server_id = utils::UUID(value->value());
759-
if (server_id == utils::UUID{}) {
760-
upgrade_log.error("empty Host ID for host {} ", ip_addr);
761-
return;
762-
}
763-
// The failure detector needs the IPs on all shards. We
764-
// can safely overwrite existing entries since are loading
765-
// them directly from gossiper app state - which is most
766-
// recent.
767-
_raft_gr.address_map().add_or_update_entry(raft::server_id{server_id}, ip_addr);
768-
});
769-
}
770-
771746
future<> raft_group0::finish_setup_after_join(service::storage_service& ss, cql3::query_processor& qp, service::migration_manager& mm, bool topology_change_enabled) {
772747
if (joined_group0()) {
773748
group0_log.info("finish_setup_after_join: group 0 ID present, loading server info.");

service/raft/raft_group0.hh

-4
Original file line numberDiff line numberDiff line change
@@ -361,10 +361,6 @@ private:
361361
// Retries on raft::commit_status_unknown.
362362
future<> make_raft_config_nonvoter(const std::unordered_set<raft::server_id>&);
363363

364-
// Load the initial Raft <-> IP address map as seen by
365-
// the gossiper.
366-
void load_initial_raft_address_map();
367-
368364
// Returns true if raft is enabled
369365
future<bool> use_raft();
370366
};

0 commit comments

Comments
 (0)