Skip to content

Duplicate RVId handling #1803

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: feature/dcache
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion component/distributed_cache/distributed_cache.go
Original file line number Diff line number Diff line change
Expand Up @@ -228,14 +228,26 @@ func (dc *DistributedCache) createRVList() ([]dcache.RawVolume, error) {
if err != nil {
return nil, log.LogAndReturnError(fmt.Sprintf("DistributedCache::Start error [Failed to retrieve UUID, error: %v]", err))
}

rvList := make([]dcache.RawVolume, len(dc.cfg.CacheDirs))

// Ensure each cacheDir yields a unique rvId (filesystem GUID as reported by blkid).
rvIDToPath := make(map[string]string, len(dc.cfg.CacheDirs))

for index, path := range dc.cfg.CacheDirs {
// TODO{Akku} : More than 1 cache dir with same rvId for rv, must fail distributed cache startup
rvId, err := getBlockDeviceUUId(path)
if err != nil {
return nil, log.LogAndReturnError(fmt.Sprintf("DistributedCache::Start error [failed to get raw volume UUID: %v]", err))
}

if existingPath, exists := rvIDToPath[rvId]; exists {
return nil, log.LogAndReturnError(fmt.Sprintf(
"DistributedCache::Start error [duplicate rvId %s: %s path rvId conflicts with path %s rvId]",
rvId, path, existingPath))
}

rvIDToPath[rvId] = path

totalSpace, availableSpace, err := common.GetDiskSpaceMetricsFromStatfs(path)
if err != nil {
return nil, log.LogAndReturnError(fmt.Sprintf("DistributedCache::Start error [failed to evaluate local cache Total space: %v]", err))
Expand Down
16 changes: 16 additions & 0 deletions internal/dcache/cluster_manager/cluster_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -553,6 +553,11 @@ func cleanupRV(rv dcache.RawVolume) error {
// An RV that's offline in the RV list is guaranteed to be offline in the MV list also, i.e., no MV will contact
// this RV for chunk IO (read or write).
//
// Before proceeding, ensure that no RV ID (filesystem GUID) is shared by different cache paths,
// i.e., if an RV ID appears in both the input list and the clustermap, it must refer to the *same* path.
// Any mismatch indicates an RV ID collision; in that case, the startup will be aborted.
//
// In case of success the boolean return value indicates the following:
// true -> Found a clustermap and RV(s) were either not present in the RV list or waited for RV(s) to be marked
//
Expand Down Expand Up @@ -678,6 +683,17 @@ func (cmi *ClusterManager) safeCleanupMyRVs(myRVs []dcache.RawVolume) (bool, err
log.Info("ClusterManager::safeCleanupMyRVs: No my RV(s) in clustermap")
}

// Ensure that an rvId (filesystem GUID) already present in the clustermap is not now reported for a different cache dir.
for _, inputRV := range myRVs {
for _, cmRV := range myRVsFromClustermap {
if inputRV.RvId == cmRV.RvId && inputRV.LocalCachePath != cmRV.LocalCachePath {
return false, fmt.Errorf(
"ClusterManager::safeCleanupMyRVs: duplicate RVid (filesystem GUID) %s reported for paths %s and %s",
inputRV.RvId, cmRV.LocalCachePath, inputRV.LocalCachePath)
}
}
}

rvStillOnline := false
for _, rv := range myRVs {
log.Info("ClusterManager::safeCleanupMyRVs: Checking my RV %+v", rv)
Expand Down