diff --git a/.github/integration/tests/40_download.sh b/.github/integration/tests/40_download.sh
index 1176c036..d6511dd6 100755
--- a/.github/integration/tests/40_download.sh
+++ b/.github/integration/tests/40_download.sh
@@ -1,9 +1,18 @@
 #!/bin/bash
 set -e
 
+# Create a user key pair
+if ( yes "" | ./sda-cli createKey user_key ) ; then
+    echo "Created a user key pair for downloading encrypted files"
+else
+    echo "Failed to create a user key pair for downloading encrypted files"
+    exit 1
+fi
+
 # Download file by using the sda-cli download command
-./sda-cli -config testing/s3cmd-download.conf download -dataset-id https://doi.example/ty009.sfrrss/600.45asasga -url http://localhost:8080 -outdir test-download main/subfolder/dummy_data.c4gh
+./sda-cli -config testing/s3cmd-download.conf download -pubkey user_key.pub.pem -dataset-id https://doi.example/ty009.sfrrss/600.45asasga -url http://localhost:8080 -outdir test-download main/subfolder/dummy_data.c4gh
+C4GH_PASSWORD="" ./sda-cli decrypt -key user_key.sec.pem test-download/main/subfolder/dummy_data.c4gh
 
 # Check if file exists in the path
 if [ ! -f "test-download/main/subfolder/dummy_data" ]; then
     echo "Downloaded file not found"
@@ -20,13 +29,13 @@ fi
 
 rm -r test-download
 
 # Download whole dataset by using the sda-cli download command
-./sda-cli -config testing/s3cmd-download.conf download -dataset-id https://doi.example/ty009.sfrrss/600.45asasga -url http://localhost:8080 -outdir download-dataset --dataset
+./sda-cli -config testing/s3cmd-download.conf download -pubkey user_key.pub.pem -dataset-id https://doi.example/ty009.sfrrss/600.45asasga -url http://localhost:8080 -outdir download-dataset --dataset
 
 filepaths="download-dataset/main/subfolder/dummy_data download-dataset/main/subfolder2/dummy_data2 download-dataset/main/subfolder2/random/dummy_data3"
 # Check if all the files of the dataset have been downloaded
 for filepath in $filepaths; do
-    if [ ! -f "$filepath" ]; then
+    if [ ! -f "$filepath.c4gh" ]; then
        echo "File $filepath does not exist"
        exit 1
    fi
@@ -34,13 +43,6 @@ done
 rm -r download-dataset
 
-# Create a user key pair
-if ( yes "" | ./sda-cli createKey user_key ) ; then
-    echo "Created a user key pair for downloading encrypted files"
-else
-    echo "Failed to create a user key pair for downloading encrypted files"
-    exit 1
-fi
 
 # Download encrypted file by using the sda-cli download command
 ./sda-cli -config testing/s3cmd-download.conf download -pubkey user_key.pub.pem -dataset-id https://doi.example/ty009.sfrrss/600.45asasga -url http://localhost:8080 -outdir test-download main/subfolder/dummy_data.c4gh
 
@@ -69,13 +71,13 @@ fi
 
 # Download recursively a folder
 echo "Downloading content of folder"
-./sda-cli -config testing/s3cmd-download.conf download -dataset-id https://doi.example/ty009.sfrrss/600.45asasga -url http://localhost:8080 -outdir download-folder --recursive main/subfolder2
+./sda-cli -config testing/s3cmd-download.conf download -pubkey user_key.pub.pem -dataset-id https://doi.example/ty009.sfrrss/600.45asasga -url http://localhost:8080 -outdir download-folder --recursive main/subfolder2
 
 folderpaths="download-folder/main/subfolder2/dummy_data2 download-folder/main/subfolder2/random/dummy_data3"
 # Check if the content of the folder has been downloaded
 for folderpath in $folderpaths; do
-    if [ ! -f "$folderpath" ]; then
+    if [ ! -f "$folderpath.c4gh" ]; then
        echo "Content of folder $folderpath is missing"
        exit 1
    fi
@@ -84,14 +86,15 @@ done
 rm -r download-folder
 
 # Download dataset by providing the dataset id
-./sda-cli -config testing/s3cmd-download.conf download -dataset-id https://doi.example/ty009.sfrrss/600.45asasga -url http://localhost:8080 -outdir download-fileid urn:neic:001-001
+./sda-cli -config testing/s3cmd-download.conf download -pubkey user_key.pub.pem -dataset-id https://doi.example/ty009.sfrrss/600.45asasga -url http://localhost:8080 -outdir download-fileid urn:neic:001-001
 
 # Check if file exists in the path
-if [ ! -f "download-fileid/main/subfolder/dummy_data" ]; then
+if [ ! -f "download-fileid/main/subfolder/dummy_data.c4gh" ]; then
    echo "Downloaded file by using the file id not found"
    exit 1
 fi
 
+C4GH_PASSWORD="" ./sda-cli decrypt -key user_key.sec.pem download-fileid/main/subfolder/dummy_data.c4gh
 # Check the first line of the file
 first_line_id=$(head -n 1 download-fileid/main/subfolder/dummy_data)
 if [[ $first_line_id != *"THIS FILE IS JUST DUMMY DATA"* ]]; then
@@ -103,10 +106,10 @@
 rm -r download-fileid
 
 # Download the file paths content of a text file
 echo "Downloading content of a text file"
-./sda-cli -config testing/s3cmd-download.conf download -dataset-id https://doi.example/ty009.sfrrss/600.45asasga -url http://localhost:8080 -outdir download-from-file --from-file testing/file-list.txt
+./sda-cli -config testing/s3cmd-download.conf download -pubkey user_key.pub.pem -dataset-id https://doi.example/ty009.sfrrss/600.45asasga -url http://localhost:8080 -outdir download-from-file --from-file testing/file-list.txt
 
 # Check if the content of the text file has been downloaded
-content_paths="download-from-file/main/subfolder/dummy_data download-from-file/main/subfolder2/dummy_data2"
+content_paths="download-from-file/main/subfolder/dummy_data.c4gh download-from-file/main/subfolder2/dummy_data2.c4gh"
 for content_path in $content_paths; do
    if [ ! -f "$content_path" ]; then
@@ -115,6 +118,7 @@ for content_path in $content_paths; do
    fi
 done
 
+C4GH_PASSWORD="" ./sda-cli decrypt -key user_key.sec.pem download-from-file/main/subfolder/dummy_data.c4gh
 # Check the first line of the file
 first_line_file=$(head -n 1 download-from-file/main/subfolder/dummy_data)
 if [[ $first_line_file != *"THIS FILE IS JUST DUMMY DATA"* ]]; then
@@ -122,7 +126,16 @@ if [[ $first_line_file != *"THIS FILE IS JUST DUMMY DATA"* ]]; then
    exit 1
 fi
 
+# Make sure files cannot be downloaded without giving a public key
+if ./sda-cli -config testing/s3cmd-download.conf download -dataset-id https://doi.example/ty009.sfrrss/600.45asasga -url http://localhost:8080 -outdir test-download main/subfolder/dummy_data.c4gh; then
+    echo "Downloaded a file without using a public key"
+    exit 1
+else
+    echo "Error expected, continue."
+fi
+
 rm -r download-from-file
 rm -r test-download
+
 echo "Integration tests for sda-cli download finished successfully"
\ No newline at end of file
-f "$folderpath.c4gh" ]; then echo "Content of folder $folderpath is missing" exit 1 fi @@ -84,14 +86,15 @@ done rm -r download-folder # Download dataset by providing the dataset id -./sda-cli -config testing/s3cmd-download.conf download -dataset-id https://doi.example/ty009.sfrrss/600.45asasga -url http://localhost:8080 -outdir download-fileid urn:neic:001-001 +./sda-cli -config testing/s3cmd-download.conf download -pubkey user_key.pub.pem -dataset-id https://doi.example/ty009.sfrrss/600.45asasga -url http://localhost:8080 -outdir download-fileid urn:neic:001-001 # Check if file exists in the path -if [ ! -f "download-fileid/main/subfolder/dummy_data" ]; then +if [ ! -f "download-fileid/main/subfolder/dummy_data.c4gh" ]; then echo "Downloaded file by using the file id not found" exit 1 fi +C4GH_PASSWORD="" ./sda-cli decrypt -key user_key.sec.pem download-fileid/main/subfolder/dummy_data.c4gh # Check the first line of the file first_line_id=$(head -n 1 download-fileid/main/subfolder/dummy_data) if [[ $first_line_id != *"THIS FILE IS JUST DUMMY DATA"* ]]; then @@ -103,10 +106,10 @@ rm -r download-fileid # Download the file paths content of a text file echo "Downloading content of a text file" -./sda-cli -config testing/s3cmd-download.conf download -dataset-id https://doi.example/ty009.sfrrss/600.45asasga -url http://localhost:8080 -outdir download-from-file --from-file testing/file-list.txt +./sda-cli -config testing/s3cmd-download.conf download -pubkey user_key.pub.pem -dataset-id https://doi.example/ty009.sfrrss/600.45asasga -url http://localhost:8080 -outdir download-from-file --from-file testing/file-list.txt # Check if the content of the text file has been downloaded -content_paths="download-from-file/main/subfolder/dummy_data download-from-file/main/subfolder2/dummy_data2" +content_paths="download-from-file/main/subfolder/dummy_data.c4gh download-from-file/main/subfolder2/dummy_data2.c4gh" for content_path in $content_paths; do if [ ! -f "$content_path" ]; then @@ -115,6 +118,7 @@ for content_path in $content_paths; do fi done +C4GH_PASSWORD="" ./sda-cli decrypt -key user_key.sec.pem download-from-file/main/subfolder/dummy_data.c4gh # Check the first line of the file first_line_file=$(head -n 1 download-from-file/main/subfolder/dummy_data) if [[ $first_line_file != *"THIS FILE IS JUST DUMMY DATA"* ]]; then @@ -122,7 +126,16 @@ if [[ $first_line_file != *"THIS FILE IS JUST DUMMY DATA"* ]]; then exit 1 fi +# Make sure files cannot be downloaded without giving a public key +if ./sda-cli -config testing/s3cmd-download.conf download -dataset-id https://doi.example/ty009.sfrrss/600.45asasga -url http://localhost:8080 -outdir test-download main/subfolder/dummy_data.c4gh; then + echo "Downloaded a file without using a public key" + exit 1 +else + echo "Error expected, continue." +fi + rm -r download-from-file rm -r test-download + echo "Integration tests for sda-cli download finished successfully" \ No newline at end of file diff --git a/download/download.go b/download/download.go index 1ba2a073..727056fa 100644 --- a/download/download.go +++ b/download/download.go @@ -2,14 +2,12 @@ package download import ( "bufio" - "encoding/base64" "encoding/json" "errors" "flag" "fmt" "io" "net/http" - "net/mail" "net/url" "os" "path/filepath" @@ -47,7 +45,8 @@ Required options: -url The url of the download server. Optional options: - -pubkey Encrypt downloaded files server-side using the specified public key. + -pubkey Key to use for encrypting downloaded files server-side. 
diff --git a/download/download_test.go b/download/download_test.go
index 4cad790f..a8f48614 100644
--- a/download/download_test.go
+++ b/download/download_test.go
@@ -138,7 +138,7 @@ func (suite *TestSuite) TestDownloadUrl() {
     token := suite.accessToken
     datasetID := "test-dataset"
     filepath := "path/to/file1"
-    expectedURL := "https://some/url/files/file1id"
+    expectedURL := "https://some/url/s3/test-dataset/path/to/file1.c4gh"
 
     //-----------------------------------------------
     // Test with an empty public key
@@ -170,16 +170,18 @@
     //-----------------------------------------------
     // Test using a nonempty public key
 
-    // Test with valid base_url, token, dataset, and filename
-    expectedURL = baseURL + "/s3-encrypted/" + datasetID + "/" + filepath
+    // Test with valid base_url, token, dataset, and fileid
+    filepath = "path/to/file1.c4gh"
+    fileid := "file1id"
+    expectedURL = baseURL + "/s3/" + datasetID + "/" + filepath
     pubKey := "test-public-key"
-    url, _, err = getFileIDURL(baseURL, token, pubKey, datasetID, filepath)
+    url, _, err = getFileIDURL(baseURL, token, pubKey, datasetID, fileid)
     assert.NoError(suite.T(), err)
     assert.Equal(suite.T(), expectedURL, url)
 
     // Test with url as dataset
     datasetID = "https://doi.example/another/url/001"
-    expectedURL = baseURL + "/s3-encrypted/" + datasetID + "/" + filepath
+    expectedURL = baseURL + "/s3/" + datasetID + "/" + filepath
     url, _, err = getFileIDURL(baseURL, token, pubKey, datasetID, filepath)
     assert.NoError(suite.T(), err)
     assert.Equal(suite.T(), expectedURL, url)
diff --git a/helpers/helpers.go b/helpers/helpers.go
index 0e446ab2..86678185 100644
--- a/helpers/helpers.go
+++ b/helpers/helpers.go
@@ -1,12 +1,14 @@
 package helpers
 
 import (
+    "encoding/base64"
     "encoding/json"
     "encoding/xml"
     "errors"
     "flag"
     "fmt"
     "io"
+    "net/mail"
     "os"
     "path/filepath"
     "regexp"
@@ -473,3 +475,34 @@ func CheckValidChars(filename string) error {
 
     return nil
 }
+
+// AnonymizeFilepath checks if the filepath has a prefixed user ID,
+// strips it, and returns the remaining filepath
+func AnonymizeFilepath(filePath string) string {
+    filePathSplit := strings.Split(filePath, "/")
+    if strings.Contains(filePathSplit[0], "_") {
+        // prefixed user IDs are email addresses with '@' replaced by '_'
+        _, err := mail.ParseAddress(strings.ReplaceAll(filePathSplit[0], "_", "@"))
+        if err == nil {
+            filePath = strings.Join(filePathSplit[1:], "/")
+        }
+    }
+
+    return filePath
+}
+
+// GetPublicKey64 reads the public key and encodes it in base64
+func GetPublicKey64(pubKeyPath *string) (string, error) {
+    *pubKeyPath = strings.TrimSpace(*pubKeyPath)
+    var pubKeyBase64 string
+    if *pubKeyPath != "" {
+        // Read the public key
+        pubKey, err := os.ReadFile(*pubKeyPath)
+        if err != nil {
+            return "", fmt.Errorf("failed to read public key, reason: %v", err)
+        }
+        pubKeyBase64 = base64.StdEncoding.EncodeToString(pubKey)
+    }
+
+    return pubKeyBase64, nil
+}
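A short usage sketch of the two helpers added above, again assuming the import path github.com/NBISweden/sda-cli/helpers; the user_example.org prefix is an invented user-prefixed path, and user_key.pub.pem matches the key pair created in the integration test:

    package main

    import (
        "fmt"
        "log"

        "github.com/NBISweden/sda-cli/helpers"
    )

    func main() {
        // A stored path prefixed with a user ID (an email address with '@' replaced
        // by '_') is reduced to the dataset-relative path.
        fmt.Println(helpers.AnonymizeFilepath("user_example.org/main/subfolder/dummy_data.c4gh"))
        // Output: main/subfolder/dummy_data.c4gh

        // Paths without a user-ID prefix are returned unchanged.
        fmt.Println(helpers.AnonymizeFilepath("main/subfolder/dummy_data.c4gh"))

        // Read a crypt4gh public key and base64-encode it for the download request.
        keyPath := "user_key.pub.pem"
        pubKeyBase64, err := helpers.GetPublicKey64(&keyPath)
        if err != nil {
            log.Fatal(err)
        }
        fmt.Println(len(pubKeyBase64) > 0)
    }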
"flag" "fmt" "io" + "net/mail" "os" "path/filepath" "regexp" @@ -473,3 +475,34 @@ func CheckValidChars(filename string) error { return nil } + +// AnonymizeFilepath checks if the filepath has a prefixed user ID +// strips that, and then returns the filepath +func AnonymizeFilepath(filePath string) string { + filePathSplit := strings.Split(filePath, "/") + if strings.Contains(filePathSplit[0], "_") { + // prefixed user IDs are email adresses with '@' replaced by '_' + _, err := mail.ParseAddress(strings.ReplaceAll(filePathSplit[0], "_", "@")) + if err == nil { + filePath = strings.Join(filePathSplit[1:], "/") + } + } + + return filePath +} + +// Reads the public key and encodes it in base64 +func GetPublicKey64(pubKeyPath *string) (string, error) { + *pubKeyPath = strings.TrimSpace(*pubKeyPath) + var pubKeyBase64 string + if *pubKeyPath != "" { + // Read the public key + pubKey, err := os.ReadFile(*pubKeyPath) + if err != nil { + return "", fmt.Errorf("failed to read public key, reason: %v", err) + } + pubKeyBase64 = base64.StdEncoding.EncodeToString(pubKey) + } + + return pubKeyBase64, nil +} diff --git a/htsget/htsget_test.go b/htsget/htsget_test.go index 32bb3bb7..b8af352a 100644 --- a/htsget/htsget_test.go +++ b/htsget/htsget_test.go @@ -134,7 +134,7 @@ KKj6NUcJGZ2/HeqkYbxm57ZaFLP5cIHsdK+0nQubFVs= "url": "data:;base64,Y3J5cHQ0Z2gBAAAAAgAAAA==" }, { - "url": "http://localhost/s3-encrypted/DATASET0001/htsnexus_test_NA12878.bam.c4gh", + "url": "http://localhost/s3/DATASET0001/htsnexus_test_NA12878.bam.c4gh", "headers": { "Range": "bytes=16-123", "accept-encoding": "gzip", @@ -148,7 +148,7 @@ KKj6NUcJGZ2/HeqkYbxm57ZaFLP5cIHsdK+0nQubFVs= "url": "data:;base64,ZAAAAAAAAAB7zX5e64IzHWf5/X8nkdCKpwsX0eT4/AHU77sh2+EdIXwkSEyPQ5ZP2+vRHvytn6H1hf63Wo7gPdDc59KZfz+10kjywPqQUXYOoSbeQ6cxx2dxmf2nSwSd2Wh1jA==" }, { - "url": "http://localhost/s3-encrypted/DATASET0001/htsnexus_test_NA12878.bam.c4gh", + "url": "http://localhost/s3/DATASET0001/htsnexus_test_NA12878.bam.c4gh", "headers": { "Range": "bytes=124-1049147", "accept-encoding": "gzip", @@ -159,7 +159,7 @@ KKj6NUcJGZ2/HeqkYbxm57ZaFLP5cIHsdK+0nQubFVs= } }, { - "url": "http://localhost/s3-encrypted/DATASET0001/htsnexus_test_NA12878.bam.c4gh", + "url": "http://localhost/s3/DATASET0001/htsnexus_test_NA12878.bam.c4gh", "headers": { "Range": "bytes=2557120-2598042", "accept-encoding": "gzip", diff --git a/testing/docker-compose.yml b/testing/docker-compose.yml index d6834ff3..e304e6e8 100644 --- a/testing/docker-compose.yml +++ b/testing/docker-compose.yml @@ -201,7 +201,7 @@ services: - GRPC_PORT=50051 - GRPC_HOST=reencrypt - APP_SERVEUNENCRYPTEDDATA=true - image: "ghcr.io/neicnordic/sensitive-data-archive:v0.3.179-download" #this is the last version that supports /s3-encrypted + image: "ghcr.io/neicnordic/sensitive-data-archive:${TAG}-download" volumes: - ./archive_data/4293c9a7-dc50-46db-b79a-27ddc0dad1c6:/tmp/4293c9a7-dc50-46db-b79a-27ddc0dad1c6 mem_limit: 256m