Skip to content

Commit

Permalink
[8.x] [ML] Trained Models: Cannot deploy a model after a failed deplo…
Browse files Browse the repository at this point in the history
…yment (#211459) (#212353)

# Backport

This will backport the following commits from `main` to `8.x`:
- [[ML] Trained Models: Cannot deploy a model after a failed deployment
(#211459)](https://github.com/elastic/kibana/pull/211459)

<!--- Backport version: 9.6.6 -->

### Questions?
Please refer to the [Backport tool
documentation](https://github.com/sorenlouv/backport)

<!--BACKPORT [{"author":{"name":"Robert
Jaszczurek","email":"92210485+rbrtj@users.noreply.github.com"},"sourceCommit":{"committedDate":"2025-02-19T09:45:54Z","message":"[ML]
Trained Models: Cannot deploy a model after a failed deployment
(#211459)\n\nAfter the recent changes
in\nhttps://github.com//pull/205699\nIf a deployment
fails, the error will be handled correctly at a single\ndeployment
level, however, the pipeline would break, thus further\ndeployments
wouldn't be
proceeded.","sha":"58cea843e915ba4dd7184f7718cb50feca77d05a","branchLabelMapping":{"^v9.1.0$":"main","^v8.19.0$":"8.x","^v(\\d+).(\\d+).\\d+$":"$1.$2"}},"sourcePullRequest":{"labels":["release_note:fix",":ml","backport
missing","Team:ML","backport:version","v9.1.0","v8.19.0"],"title":"[ML]
Trained Models: Cannot deploy a model after a failed
deployment","number":211459,"url":"https://github.com/elastic/kibana/pull/211459","mergeCommit":{"message":"[ML]
Trained Models: Cannot deploy a model after a failed deployment
(#211459)\n\nAfter the recent changes
in\nhttps://github.com//pull/205699\nIf a deployment
fails, the error will be handled correctly at a single\ndeployment
level, however, the pipeline would break, thus further\ndeployments
wouldn't be
proceeded.","sha":"58cea843e915ba4dd7184f7718cb50feca77d05a"}},"sourceBranch":"main","suggestedTargetBranches":["8.x"],"targetPullRequestStates":[{"branch":"main","label":"v9.1.0","branchLabelMappingKey":"^v9.1.0$","isSourceBranch":true,"state":"MERGED","url":"https://github.com/elastic/kibana/pull/211459","number":211459,"mergeCommit":{"message":"[ML]
Trained Models: Cannot deploy a model after a failed deployment
(#211459)\n\nAfter the recent changes
in\nhttps://github.com//pull/205699\nIf a deployment
fails, the error will be handled correctly at a single\ndeployment
level, however, the pipeline would break, thus further\ndeployments
wouldn't be
proceeded.","sha":"58cea843e915ba4dd7184f7718cb50feca77d05a"}},{"branch":"8.x","label":"v8.19.0","branchLabelMappingKey":"^v8.19.0$","isSourceBranch":false,"state":"NOT_CREATED"}]}]
BACKPORT-->
  • Loading branch information
rbrtj authored Feb 25, 2025
1 parent 221205c commit d3d4cae
Show file tree
Hide file tree
Showing 2 changed files with 107 additions and 54 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,38 @@ describe('TrainedModelsService', () => {
let scheduledDeploymentsSubject: BehaviorSubject<StartAllocationParams[]>;
let mockSetScheduledDeployments: jest.Mock<any, any>;

// Shared fixture for a successful response from the mocked `startModelAllocation` call.
// Tests wrap it with `of(...)` so the mock emits it as the deployment result.
const startModelAllocationResponseMock = {
assignment: {
task_parameters: {
model_id: 'deploy-model',
model_bytes: 1000,
allocation_id: 'test-allocation',
priority: 'normal',
number_of_allocations: 1,
threads_per_allocation: 1,
queue_capacity: 1024,
deployment_id: 'my-deployment-id',
cache_size: '1mb',
},
node_count: 1,
routing_table: {
'node-1': {
routing_state: 'started',
reason: '',
current_allocations: 1,
target_allocations: 1,
},
},
assignment_state: 'started',
start_time: 1234567890,
adaptive_allocations: {
enabled: true,
min_number_of_allocations: 1,
max_number_of_allocations: 4,
},
// `as const` keeps string values such as 'normal' and 'started' as literal types.
} as const,
};

const mockDisplayErrorToast = jest.fn();
const mockDisplaySuccessToast = jest.fn();

Expand Down Expand Up @@ -189,37 +221,7 @@ describe('TrainedModelsService', () => {
mockTrainedModelsApiService.getTrainedModelsList.mockResolvedValueOnce([mockModel]);

mockTrainedModelsApiService.startModelAllocation.mockReturnValueOnce(
of({
assignment: {
task_parameters: {
model_id: 'deploy-model',
model_bytes: 1000,
allocation_id: 'test-allocation',
priority: 'normal',
number_of_allocations: 1,
threads_per_allocation: 1,
queue_capacity: 1024,
deployment_id: 'my-deployment-id',
cache_size: '1mb',
},
node_count: 1,
routing_table: {
'node-1': {
routing_state: 'started',
reason: '',
current_allocations: 1,
target_allocations: 1,
},
},
assignment_state: 'started',
start_time: 1234567890,
adaptive_allocations: {
enabled: true,
min_number_of_allocations: 1,
max_number_of_allocations: 4,
},
},
})
of(startModelAllocationResponseMock)
);

// Start deployment
Expand Down Expand Up @@ -345,4 +347,53 @@ describe('TrainedModelsService', () => {
})
);
});

it('allows new deployments after a failed deployment', async () => {
  const modelId = 'test-model';
  const downloadedModel = {
    model_id: modelId,
    state: MODEL_STATE.DOWNLOADED,
    type: ['pytorch'],
  } as unknown as TrainedModelUIItem;

  // Requests a deployment of the model under test with the given deployment id.
  const requestDeployment = (deploymentId: string): void => {
    trainedModelsService.startModelDeployment(modelId, {
      deployment_id: deploymentId,
      priority: 'low',
      threads_per_allocation: 1,
    });
  };

  // Advances Jest's timers by 100ms, then awaits flushPromises().
  const advanceAndFlush = async (): Promise<void> => {
    jest.advanceTimersByTime(100);
    await flushPromises();
  };

  // Every model list fetch resolves to the single downloaded model.
  mockTrainedModelsApiService.getTrainedModelsList.mockResolvedValue([downloadedModel]);

  // The first allocation request errors; the second emits the success fixture.
  const failedStart$ = throwError(() => new Error('First deployment failed'));
  const successfulStart$ = of(startModelAllocationResponseMock);
  mockTrainedModelsApiService.startModelAllocation
    .mockReturnValueOnce(failedStart$)
    .mockReturnValueOnce(successfulStart$);

  // First deployment: the failure must be reported through the error toast.
  requestDeployment('first-deployment');
  await advanceAndFlush();

  expect(mockDisplayErrorToast).toHaveBeenCalledWith(
    expect.any(Error),
    expect.stringContaining('first-deployment')
  );

  await advanceAndFlush();

  // Second deployment: it must still reach the API and report success.
  requestDeployment('second-deployment');
  await advanceAndFlush();

  expect(mockTrainedModelsApiService.startModelAllocation).toHaveBeenCalledTimes(2);
  expect(mockDisplaySuccessToast).toHaveBeenCalledWith(
    expect.objectContaining({
      text: expect.stringContaining('second-deployment'),
    })
  );
});
});
Original file line number Diff line number Diff line change
Expand Up @@ -441,29 +441,31 @@ export class TrainedModelsService {
}),
});
},
error: (error) => {
this.displayErrorToast?.(
error,
i18n.translate('xpack.ml.trainedModels.modelsList.startFailed', {
defaultMessage: 'Failed to start "{deploymentId}"',
values: {
deploymentId: deployment.deploymentParams.deployment_id,
},
})
);
},
finalize: () => {
this.removeScheduledDeployments({
deploymentId: deployment.deploymentParams.deployment_id!,
});
// Manually update the BehaviorSubject to ensure proper cleanup
// if user navigates away, as localStorage hook won't be available to handle updates
const updatedDeployments = this._scheduledDeployments$
.getValue()
.filter((d) => d.modelId !== deployment.modelId);
this._scheduledDeployments$.next(updatedDeployments);
this.fetchModels();
},
}),
catchError((error) => {
this.displayErrorToast?.(
error,
i18n.translate('xpack.ml.trainedModels.modelsList.startFailed', {
defaultMessage: 'Failed to start "{deploymentId}"',
values: {
deploymentId: deployment.deploymentParams.deployment_id,
},
})
);
// Return null to allow stream to continue
return of(null);
}),
finalize(() => {
this.removeScheduledDeployments({
deploymentId: deployment.deploymentParams.deployment_id!,
});
// Manually update the BehaviorSubject to ensure proper cleanup
// if user navigates away, as localStorage hook won't be available to handle updates
const updatedDeployments = this._scheduledDeployments$
.getValue()
.filter((d) => d.modelId !== deployment.modelId);
this._scheduledDeployments$.next(updatedDeployments);
this.fetchModels();
})
)
);
Expand Down

0 comments on commit d3d4cae

Please sign in to comment.