Skip to content

Data sharing manager bug and scaling fixes #970

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Apr 23, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 29 additions & 2 deletions lib/workload/stateless/stacks/data-sharing-manager/deploy/stack.ts
Original file line number Diff line number Diff line change
Expand Up @@ -327,7 +327,7 @@ export class DataSharingStack extends Stack {
// Add container to task role
const dataSummaryReportContainer = taskDefinition.addContainer('dataSummaryReportContainer', {
image: ecs.ContainerImage.fromDockerImageAsset(
new ecrAssets.DockerImageAsset(this, 'gzipRawMd5sum', {
new ecrAssets.DockerImageAsset(this, 'data_summary_reporter', {
directory: path.join(__dirname, '../ecs/tasks/generate_data_summary_report'),
buildArgs: {
TARGETPLATFORM: architecture.dockerPlatform,
Expand Down Expand Up @@ -656,10 +656,37 @@ export class DataSharingStack extends Stack {
})
);

// Pusher requires permissions to execute itself
// Because this step function's execution uses a distributed map, we
// have to wire up some extra permissions
// Grant the state machine's role to execute itself
// However we cannot just grant permission to the role as this will result in a circular dependency
// between the state machine and the role
// Instead we use the workaround here - https://github.com/aws/aws-cdk/issues/28820#issuecomment-1936010520
// s3PushStateMachine.grantStartExecution(s3PushStateMachine);
const distributedMapPolicy = new iam.Policy(this, 'push-sfn-distributed-map-policy', {
document: new iam.PolicyDocument({
statements: [
new iam.PolicyStatement({
resources: [s3PushStateMachine.stateMachineArn],
actions: ['states:StartExecution'],
}),
new iam.PolicyStatement({
resources: [
`arn:aws:states:${cdk.Aws.REGION}:${cdk.Aws.ACCOUNT_ID}:execution:${s3PushStateMachine.stateMachineName}/*:*`,
],
actions: ['states:RedriveExecution'],
}),
],
}),
});
// Add the policy to the state machine role
s3PushStateMachine.role.attachInlinePolicy(distributedMapPolicy);

// https://docs.aws.amazon.com/step-functions/latest/dg/connect-stepfunctions.html#sync-async-iam-policies
// Polling requires permission for states:DescribeExecution
NagSuppressions.addResourceSuppressions(
s3PushStateMachine,
[s3PushStateMachine, distributedMapPolicy],
[
{
id: 'AwsSolutions-IAM5',
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ class LibraryModel(pa.DataFrameModel):
"tumor", "normal", "negative-control"
])
workflow: str = pa.Field()
quality: str = pa.Field()
quality: Optional[str] = pa.Field(nullable=True)
type: str = pa.Field()
assay: str = pa.Field()
coverage: float = pa.Field(ge=0, coerce=True)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from dyntastic import Dyntastic
from fastapi.encoders import jsonable_encoder
from pydantic import Field, BaseModel, ConfigDict, model_validator, computed_field
from datetime import datetime, timedelta
from datetime import datetime, timedelta, timezone
from enum import Enum

from data_sharing_tools.utils.models import SecondaryAnalysisDataTypeEnum
Expand Down Expand Up @@ -157,7 +157,7 @@ class PackageResponse(PackageWithId):
@computed_field
def has_expired(self) -> bool:
return (
True if PackageData(**self.model_dump()).is_expired()
True if PackageData.get(self.id).is_expired()
else False
)

Expand Down Expand Up @@ -230,7 +230,7 @@ def model_dump(self, **kwargs):

def is_expired(self):
return (
True if (self.request_time + timedelta(days=30)) < datetime.now()
True if (self.request_time + timedelta(days=30)) < datetime.now(timezone.utc)
else False
)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -96,27 +96,27 @@ def handler(event, context) -> Dict[str, List[Dict[str, str]]]:
}


if __name__ == "__main__":
from os import environ
import json
environ['AWS_REGION'] = 'ap-southeast-2'
environ['AWS_PROFILE'] = 'umccr-production'
environ['HOSTNAME_SSM_PARAMETER'] = '/hosted_zone/umccr/name'
environ['ORCABUS_TOKEN_SECRET_ID'] = 'orcabus/token-service-jwt'
print(json.dumps(
handler(
{
"ingestIdList": [
"01960822-f3ee-77d0-8fa4-91d5c2e17ad9",
]
},
None
),
indent=4)
)

# {
# "ingestId": "01960822-f3ee-77d0-8fa4-91d5c2e17ad9",
# "presignedUrl": "...",
# "presignedExpiry": "2025-04-14T22:52:35Z"
# },
# if __name__ == "__main__":
# from os import environ
# import json
# environ['AWS_REGION'] = 'ap-southeast-2'
# environ['AWS_PROFILE'] = 'umccr-production'
# environ['HOSTNAME_SSM_PARAMETER'] = '/hosted_zone/umccr/name'
# environ['ORCABUS_TOKEN_SECRET_ID'] = 'orcabus/token-service-jwt'
# print(json.dumps(
# handler(
# {
# "ingestIdList": [
# "01960822-f3ee-77d0-8fa4-91d5c2e17ad9",
# ]
# },
# None
# ),
# indent=4)
# )
#
# # {
# # "ingestId": "01960822-f3ee-77d0-8fa4-91d5c2e17ad9",
# # "presignedUrl": "...",
# # "presignedExpiry": "2025-04-14T22:52:35Z"
# # },
Original file line number Diff line number Diff line change
Expand Up @@ -327,48 +327,48 @@ def handler(event: Dict, context: Any) -> Dict[str, List[str]]:
# #


if __name__ == "__main__":
from os import environ
import json

environ['AWS_PROFILE'] = 'umccr-production'
environ['AWS_REGION'] = 'ap-southeast-2'
environ['HOSTNAME_SSM_PARAMETER'] = '/hosted_zone/umccr/name'
environ['ORCABUS_TOKEN_SECRET_ID'] = 'orcabus/token-service-jwt'
print(
json.dumps(
handler(
{
"workflowRunObject": {
"orcabusId": "wv1.01HZB3XW00SD59RB456ZFBM1M6",
"timestamp": "2024-06-02T00:00:00Z",
"workflowName": "umccrise",
"workflowVersion": "2.3.1--1--9344851",
"portalRunId": "20240602e4238704",
"libraries": [
{
"libraryId": "L2400668",
"orcabusId": "lib.01JBMVY7RK8ZDRZEKMPV8K60Z3"
},
{
"libraryId": "L2400667",
"orcabusId": "lib.01JBMVY7QFA11HR5J4JS3Y2Y6K"
}
]
}
},
None
),
indent=4
)
)

# {
# "ingestIdList": [
# "01932e37-01a1-7d12-9a69-006d44292b5b",
# "01932e37-0290-76e0-8d03-b624c925c4ae",
# ....
# "01932e41-26a2-7b31-9487-c6e679d80cd1"
# ]
# }
# if __name__ == "__main__":
# from os import environ
# import json
#
# environ['AWS_PROFILE'] = 'umccr-production'
# environ['AWS_REGION'] = 'ap-southeast-2'
# environ['HOSTNAME_SSM_PARAMETER'] = '/hosted_zone/umccr/name'
# environ['ORCABUS_TOKEN_SECRET_ID'] = 'orcabus/token-service-jwt'
# print(
# json.dumps(
# handler(
# {
# "workflowRunObject": {
# "orcabusId": "wv1.01HZB3XW00SD59RB456ZFBM1M6",
# "timestamp": "2024-06-02T00:00:00Z",
# "workflowName": "umccrise",
# "workflowVersion": "2.3.1--1--9344851",
# "portalRunId": "20240602e4238704",
# "libraries": [
# {
# "libraryId": "L2400668",
# "orcabusId": "lib.01JBMVY7RK8ZDRZEKMPV8K60Z3"
# },
# {
# "libraryId": "L2400667",
# "orcabusId": "lib.01JBMVY7QFA11HR5J4JS3Y2Y6K"
# }
# ]
# }
# },
# None
# ),
# indent=4
# )
# )
#
# # {
# # "ingestIdList": [
# # "01932e37-01a1-7d12-9a69-006d44292b5b",
# # "01932e37-0290-76e0-8d03-b624c925c4ae",
# # ....
# # "01932e41-26a2-7b31-9487-c6e679d80cd1"
# # ]
# # }
# #
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ def handler(event, context) -> Dict[str, List[Dict[str, str]]]:
# Extract the jobId and pushLocation from the event
job_id = event.get("packagingJobId")
push_location = event.get("pushLocation")
count_only = event.get("countOnly", False)
pagination_index = event.get("paginationIndex", None)

# Check if the jobId and pushLocation are provided
if not job_id or not push_location:
Expand Down Expand Up @@ -108,6 +110,41 @@ def handler(event, context) -> Dict[str, List[Dict[str, str]]]:
}
)

destination_and_source_uri_mappings_list.sort(
key=lambda x: x["destinationUri"]
)

# If count only is true, return the count of the destination and source uri mappings list
if count_only:
return {
"listCount": len(destination_and_source_uri_mappings_list)
}

if pagination_index:
return {
"destinationAndSourceUriMappingsList": destination_and_source_uri_mappings_list[pagination_index[0]:pagination_index[1]+1]
}

return {
"destinationAndSourceUriMappingsList": destination_and_source_uri_mappings_list
}


# if __name__ == "__main__":
# import json
# from os import environ
#
# environ['AWS_PROFILE'] = 'umccr-production'
# environ['PACKAGING_TABLE_NAME'] = 'data-sharing-packaging-lookup-table'
# environ['CONTENT_INDEX_NAME'] = 'content-index'
#
# print(
# handler(
# {
# "packagingJobId": "pkg.01JS1553E52WEZ43CXVYRSPM5N",
# "pushLocation": "s3://radio-fastq-landing/2025-04-17-cttsov2/",
# "paginationIndex": [0, 23],
# },
# None
# )
# )
Original file line number Diff line number Diff line change
Expand Up @@ -206,45 +206,45 @@ def handler(event, context) -> Dict[str, WorkflowRunModelSlim]:
}


if __name__ == "__main__":
from os import environ
import json

# Set envs
environ['AWS_PROFILE'] = 'umccr-production'
environ['AWS_REGION'] = 'ap-southeast-2'
environ['HOSTNAME_SSM_PARAMETER'] = '/hosted_zone/umccr/name'
environ['ORCABUS_TOKEN_SECRET_ID'] = 'orcabus/token-service-jwt'
environ['ATHENA_WORKGROUP_NAME'] = 'orcahouse'
environ['ATHENA_DATASOURCE_NAME'] = 'orcavault'
environ['ATHENA_DATABASE_NAME'] = 'mart'

print(json.dumps(
handler(
{
"portalRunId": "20240420746761e7"
},
None
),
indent=4
))

# {
# "workflowRunObject": {
# "orcabusId": "wv1.01HKGKG700SJ3EW38M7RP8K8BR",
# "timestamp": "2024-01-07T00:00:00Z",
# "workflowName": "umccrise",
# "workflowVersion": "2.3.1--1--9344851",
# "portalRunId": "202401075d94d609",
# "libraries": [
# {
# "libraryId": "L2301517",
# "orcabusId": "lib.01JBMVJ2EPCW8W051H82JF4MTX"
# },
# {
# "libraryId": "L2301512",
# "orcabusId": "lib.01JBMVJ1DQHB5HA6MP7BDYE94K"
# }
# ]
# }
# }
# if __name__ == "__main__":
# from os import environ
# import json
#
# # Set envs
# environ['AWS_PROFILE'] = 'umccr-production'
# environ['AWS_REGION'] = 'ap-southeast-2'
# environ['HOSTNAME_SSM_PARAMETER'] = '/hosted_zone/umccr/name'
# environ['ORCABUS_TOKEN_SECRET_ID'] = 'orcabus/token-service-jwt'
# environ['ATHENA_WORKGROUP_NAME'] = 'orcahouse'
# environ['ATHENA_DATASOURCE_NAME'] = 'orcavault'
# environ['ATHENA_DATABASE_NAME'] = 'mart'
#
# print(json.dumps(
# handler(
# {
# "portalRunId": "20240420746761e7"
# },
# None
# ),
# indent=4
# ))
#
# # {
# # "workflowRunObject": {
# # "orcabusId": "wv1.01HKGKG700SJ3EW38M7RP8K8BR",
# # "timestamp": "2024-01-07T00:00:00Z",
# # "workflowName": "umccrise",
# # "workflowVersion": "2.3.1--1--9344851",
# # "portalRunId": "202401075d94d609",
# # "libraries": [
# # {
# # "libraryId": "L2301517",
# # "orcabusId": "lib.01JBMVJ2EPCW8W051H82JF4MTX"
# # },
# # {
# # "libraryId": "L2301512",
# # "orcabusId": "lib.01JBMVJ1DQHB5HA6MP7BDYE94K"
# # }
# # ]
# # }
# # }
Loading
Loading