Skip to content

Commit

Permalink
Remove usage of s3-encrypted (#506)
Browse files Browse the repository at this point in the history
  • Loading branch information
MalinAhlberg authored Feb 24, 2025
2 parents c11b209 + 43e6ae2 commit 5e622b3
Show file tree
Hide file tree
Showing 6 changed files with 94 additions and 58 deletions.
45 changes: 29 additions & 16 deletions .github/integration/tests/40_download.sh
Original file line number Diff line number Diff line change
@@ -1,9 +1,18 @@
#!/bin/bash
set -e

# Create a user key pair
if ( yes "" | ./sda-cli createKey user_key ) ; then
echo "Created a user key pair for downloading encrypted files"
else
echo "Failed to create a user key pair for downloading encrypted files"
exit 1
fi

# Download file by using the sda-cli download command
./sda-cli -config testing/s3cmd-download.conf download -dataset-id https://doi.example/ty009.sfrrss/600.45asasga -url http://localhost:8080 -outdir test-download main/subfolder/dummy_data.c4gh
./sda-cli -config testing/s3cmd-download.conf download -pubkey user_key.pub.pem -dataset-id https://doi.example/ty009.sfrrss/600.45asasga -url http://localhost:8080 -outdir test-download main/subfolder/dummy_data.c4gh

C4GH_PASSWORD="" ./sda-cli decrypt -key user_key.sec.pem test-download/main/subfolder/dummy_data.c4gh
# Check if file exists in the path
if [ ! -f "test-download/main/subfolder/dummy_data" ]; then
echo "Downloaded file not found"
Expand All @@ -20,27 +29,20 @@ fi
rm -r test-download

# Download whole dataset by using the sda-cli download command
./sda-cli -config testing/s3cmd-download.conf download -dataset-id https://doi.example/ty009.sfrrss/600.45asasga -url http://localhost:8080 -outdir download-dataset --dataset
./sda-cli -config testing/s3cmd-download.conf download -pubkey user_key.pub.pem -dataset-id https://doi.example/ty009.sfrrss/600.45asasga -url http://localhost:8080 -outdir download-dataset --dataset

filepaths="download-dataset/main/subfolder/dummy_data download-dataset/main/subfolder2/dummy_data2 download-dataset/main/subfolder2/random/dummy_data3"

# Check if all the files of the dataset have been downloaded
for filepath in $filepaths; do
if [ ! -f "$filepath" ]; then
if [ ! -f "$filepath.c4gh" ]; then
echo "File $filepath does not exist"
exit 1
fi
done

rm -r download-dataset

# Create a user key pair
if ( yes "" | ./sda-cli createKey user_key ) ; then
echo "Created a user key pair for downloading encrypted files"
else
echo "Failed to create a user key pair for downloading encrypted files"
exit 1
fi
# Download encrypted file by using the sda-cli download command
./sda-cli -config testing/s3cmd-download.conf download -pubkey user_key.pub.pem -dataset-id https://doi.example/ty009.sfrrss/600.45asasga -url http://localhost:8080 -outdir test-download main/subfolder/dummy_data.c4gh

Expand Down Expand Up @@ -69,13 +71,13 @@ fi

# Download recursively a folder
echo "Downloading content of folder"
./sda-cli -config testing/s3cmd-download.conf download -dataset-id https://doi.example/ty009.sfrrss/600.45asasga -url http://localhost:8080 -outdir download-folder --recursive main/subfolder2
./sda-cli -config testing/s3cmd-download.conf download -pubkey user_key.pub.pem -dataset-id https://doi.example/ty009.sfrrss/600.45asasga -url http://localhost:8080 -outdir download-folder --recursive main/subfolder2

folderpaths="download-folder/main/subfolder2/dummy_data2 download-folder/main/subfolder2/random/dummy_data3"

# Check if the content of the folder has been downloaded
for folderpath in $folderpaths; do
if [ ! -f "$folderpath" ]; then
if [ ! -f "$folderpath.c4gh" ]; then
echo "Content of folder $folderpath is missing"
exit 1
fi
Expand All @@ -84,14 +86,15 @@ done
rm -r download-folder

# Download a single file by providing its file id
./sda-cli -config testing/s3cmd-download.conf download -dataset-id https://doi.example/ty009.sfrrss/600.45asasga -url http://localhost:8080 -outdir download-fileid urn:neic:001-001
./sda-cli -config testing/s3cmd-download.conf download -pubkey user_key.pub.pem -dataset-id https://doi.example/ty009.sfrrss/600.45asasga -url http://localhost:8080 -outdir download-fileid urn:neic:001-001

# Check if file exists in the path
if [ ! -f "download-fileid/main/subfolder/dummy_data" ]; then
if [ ! -f "download-fileid/main/subfolder/dummy_data.c4gh" ]; then
echo "Downloaded file by using the file id not found"
exit 1
fi

C4GH_PASSWORD="" ./sda-cli decrypt -key user_key.sec.pem download-fileid/main/subfolder/dummy_data.c4gh
# Check the first line of the file
first_line_id=$(head -n 1 download-fileid/main/subfolder/dummy_data)
if [[ $first_line_id != *"THIS FILE IS JUST DUMMY DATA"* ]]; then
Expand All @@ -103,10 +106,10 @@ rm -r download-fileid

# Download the file paths content of a text file
echo "Downloading content of a text file"
./sda-cli -config testing/s3cmd-download.conf download -dataset-id https://doi.example/ty009.sfrrss/600.45asasga -url http://localhost:8080 -outdir download-from-file --from-file testing/file-list.txt
./sda-cli -config testing/s3cmd-download.conf download -pubkey user_key.pub.pem -dataset-id https://doi.example/ty009.sfrrss/600.45asasga -url http://localhost:8080 -outdir download-from-file --from-file testing/file-list.txt

# Check if the content of the text file has been downloaded
content_paths="download-from-file/main/subfolder/dummy_data download-from-file/main/subfolder2/dummy_data2"
content_paths="download-from-file/main/subfolder/dummy_data.c4gh download-from-file/main/subfolder2/dummy_data2.c4gh"

for content_path in $content_paths; do
if [ ! -f "$content_path" ]; then
Expand All @@ -115,14 +118,24 @@ for content_path in $content_paths; do
fi
done

C4GH_PASSWORD="" ./sda-cli decrypt -key user_key.sec.pem download-from-file/main/subfolder/dummy_data.c4gh
# Check the first line of the file
first_line_file=$(head -n 1 download-from-file/main/subfolder/dummy_data)
if [[ $first_line_file != *"THIS FILE IS JUST DUMMY DATA"* ]]; then
echo "First line does not contain the expected string"
exit 1
fi

# Make sure files cannot be downloaded without giving a public key
if ./sda-cli -config testing/s3cmd-download.conf download -dataset-id https://doi.example/ty009.sfrrss/600.45asasga -url http://localhost:8080 -outdir test-download main/subfolder/dummy_data.c4gh; then
echo "Downloaded a file without using a public key"
exit 1
else
echo "Error expected, continue."
fi

rm -r download-from-file
rm -r test-download


echo "Integration tests for sda-cli download finished successfully"
54 changes: 21 additions & 33 deletions download/download.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,12 @@ package download

import (
"bufio"
"encoding/base64"
"encoding/json"
"errors"
"flag"
"fmt"
"io"
"net/http"
"net/mail"
"net/url"
"os"
"path/filepath"
Expand Down Expand Up @@ -47,7 +45,8 @@ Required options:
-url <uri> The url of the download server.
Optional options:
-pubkey <public-key-file> Encrypt downloaded files server-side using the specified public key.
-pubkey <public-key-file> Key to use for encrypting downloaded files server-side.
This key must be given here or in the config file.
-outdir <dir> Directory to save the downloaded files.
If not specified, files will be saved in the current directory.
-dataset Download all files in the dataset specified by '-dataset-id'.
Expand Down Expand Up @@ -77,6 +76,8 @@ var recursiveDownload = Args.Bool("recursive", false, "Download content of the f

var fromFile = Args.Bool("from-file", false, "Download files from file list.")

var pubKeyBase64 string

// necessary for mocking in testing
var getResponseBody = getBody

Expand Down Expand Up @@ -148,6 +149,10 @@ func Download(args []string, configPath string) error {
if err != nil {
return err
}
pubKeyBase64, err = helpers.GetPublicKey64(pubKeyPath)
if err != nil {
return err
}

switch {
// Case where the user is setting the -dataset flag
Expand Down Expand Up @@ -191,8 +196,12 @@ func datasetCase(token string) error {
// Loop through the files and download them
for _, file := range files {
// Download URL for the file
fileURL := *URL + "/files/" + file.FileID
err = downloadFile(fileURL, token, "", file.FilePath)
fileName := helpers.AnonymizeFilepath(file.FilePath)
fileURL := *URL + "/s3/" + file.DatasetID + "/" + fileName
if err != nil {
return err
}
err = downloadFile(fileURL, token, pubKeyBase64, file.FilePath)
if err != nil {
return err
}
Expand Down Expand Up @@ -227,8 +236,9 @@ func recursiveCase(token string) error {
for _, file := range files {
if strings.Contains(file.FilePath, dirPath) {
pathExists = true
fileURL := *URL + "/files/" + file.FileID
err = downloadFile(fileURL, token, "", file.FilePath)
fileName := helpers.AnonymizeFilepath(file.FilePath)
fileURL := *URL + "/s3/" + file.DatasetID + "/" + fileName
err = downloadFile(fileURL, token, pubKeyBase64, file.FilePath)
if err != nil {
return err
}
Expand Down Expand Up @@ -269,17 +279,6 @@ func fileCase(token string, fileList bool) error {
files = append(files, Args.Args()...)
}

*pubKeyPath = strings.TrimSpace(*pubKeyPath)
var pubKeyBase64 string
if *pubKeyPath != "" {
// Read the public key
pubKey, err := os.ReadFile(*pubKeyPath)
if err != nil {
return fmt.Errorf("failed to read public key, reason: %v", err)
}
pubKeyBase64 = base64.StdEncoding.EncodeToString(pubKey)
}

// Loop through the files and download them
for _, filePath := range files {
fileIDURL, apiFilePath, err := getFileIDURL(*URL, token, pubKeyBase64, *datasetID, filePath)
Expand All @@ -300,13 +299,7 @@ func fileCase(token string, fileList bool) error {
func downloadFile(uri, token, pubKeyBase64, filePath string) error {
// Check if the file path contains a userID and if it does,
// do not keep it in the file path
filePathSplit := strings.Split(filePath, "/")
if strings.Contains(filePathSplit[0], "_") {
_, err := mail.ParseAddress(strings.ReplaceAll(filePathSplit[0], "_", "@"))
if err == nil {
filePath = strings.Join(filePathSplit[1:], "/")
}
}
filePath = helpers.AnonymizeFilepath(filePath)

outFilename := filePath
if *outDir != "" {
Expand Down Expand Up @@ -396,15 +389,10 @@ func getFileIDURL(baseURL, token, pubKeyBase64, dataset, filename string) (strin
return "", "", fmt.Errorf("File not found in dataset %s", filename)
}

var url string
// If no public key is provided, retrieve the unencrypted file
if pubKeyBase64 == "" {
url = baseURL + "/files/" + datasetFiles[idx].FileID
} else {
url = baseURL + "/s3-encrypted/" + dataset + "/" + filename
}
fileName := helpers.AnonymizeFilepath(datasetFiles[idx].FilePath)
url := baseURL + "/s3/" + dataset + "/" + fileName

return url, datasetFiles[idx].FilePath, nil
return url, fileName, nil
}

func GetDatasets(baseURL, token string) ([]string, error) {
Expand Down
12 changes: 7 additions & 5 deletions download/download_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ func (suite *TestSuite) TestDownloadUrl() {
token := suite.accessToken
datasetID := "test-dataset"
filepath := "path/to/file1"
expectedURL := "https://some/url/files/file1id"
expectedURL := "https://some/url/s3/test-dataset/path/to/file1.c4gh"

//-----------------------------------------------
// Test with an empty public key
Expand Down Expand Up @@ -170,16 +170,18 @@ func (suite *TestSuite) TestDownloadUrl() {

//-----------------------------------------------
// Test using a nonempty public key
// Test with valid base_url, token, dataset, and filename
expectedURL = baseURL + "/s3-encrypted/" + datasetID + "/" + filepath
// Test with valid base_url, token, dataset, and fileid
filepath = "path/to/file1.c4gh"
fileid := "file1id"
expectedURL = baseURL + "/s3/" + datasetID + "/" + filepath
pubKey := "test-public-key"
url, _, err = getFileIDURL(baseURL, token, pubKey, datasetID, filepath)
url, _, err = getFileIDURL(baseURL, token, pubKey, datasetID, fileid)
assert.NoError(suite.T(), err)
assert.Equal(suite.T(), expectedURL, url)

// Test with url as dataset
datasetID = "https://doi.example/another/url/001"
expectedURL = baseURL + "/s3-encrypted/" + datasetID + "/" + filepath
expectedURL = baseURL + "/s3/" + datasetID + "/" + filepath
url, _, err = getFileIDURL(baseURL, token, pubKey, datasetID, filepath)
assert.NoError(suite.T(), err)
assert.Equal(suite.T(), expectedURL, url)
Expand Down
33 changes: 33 additions & 0 deletions helpers/helpers.go
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
package helpers

import (
"encoding/base64"
"encoding/json"
"encoding/xml"
"errors"
"flag"
"fmt"
"io"
"net/mail"
"os"
"path/filepath"
"regexp"
Expand Down Expand Up @@ -473,3 +475,34 @@ func CheckValidChars(filename string) error {

return nil
}

// AnonymizeFilepath returns filePath without its leading path segment when
// that segment looks like a user ID, and returns filePath unchanged otherwise.
//
// A leading segment counts as a user ID when it contains an underscore and,
// after every '_' is replaced by '@', is accepted by mail.ParseAddress
// (prefixed user IDs are email addresses with '@' replaced by '_').
func AnonymizeFilepath(filePath string) string {
	// Only the first segment matters; keep the remainder intact.
	segments := strings.SplitN(filePath, "/", 2)
	prefix := segments[0]

	if !strings.Contains(prefix, "_") {
		return filePath
	}

	candidate := strings.ReplaceAll(prefix, "_", "@")
	if _, err := mail.ParseAddress(candidate); err != nil {
		return filePath
	}

	// The path consisted of the user ID alone: nothing is left after stripping.
	if len(segments) < 2 {
		return ""
	}

	return segments[1]
}

// GetPublicKey64 reads the public key file named by *pubKeyPath and returns
// its contents encoded with standard base64.
//
// Surrounding whitespace is trimmed from *pubKeyPath in place, so the caller
// observes the cleaned path afterwards. When the trimmed path is empty no file
// is read and ("", nil) is returned.
//
// If the file cannot be read, the returned error wraps the underlying error
// so that callers can inspect it, e.g. with errors.Is(err, os.ErrNotExist).
func GetPublicKey64(pubKeyPath *string) (string, error) {
	*pubKeyPath = strings.TrimSpace(*pubKeyPath)
	if *pubKeyPath == "" {
		// No key configured; leave it to the caller to decide if that is an error.
		return "", nil
	}

	pubKey, err := os.ReadFile(*pubKeyPath)
	if err != nil {
		// %w (rather than %v) keeps the error chain intact; the message text
		// is unchanged.
		return "", fmt.Errorf("failed to read public key, reason: %w", err)
	}

	return base64.StdEncoding.EncodeToString(pubKey), nil
}
6 changes: 3 additions & 3 deletions htsget/htsget_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ KKj6NUcJGZ2/HeqkYbxm57ZaFLP5cIHsdK+0nQubFVs=
"url": "data:;base64,Y3J5cHQ0Z2gBAAAAAgAAAA=="
},
{
"url": "http://localhost/s3-encrypted/DATASET0001/htsnexus_test_NA12878.bam.c4gh",
"url": "http://localhost/s3/DATASET0001/htsnexus_test_NA12878.bam.c4gh",
"headers": {
"Range": "bytes=16-123",
"accept-encoding": "gzip",
Expand All @@ -148,7 +148,7 @@ KKj6NUcJGZ2/HeqkYbxm57ZaFLP5cIHsdK+0nQubFVs=
"url": "data:;base64,ZAAAAAAAAAB7zX5e64IzHWf5/X8nkdCKpwsX0eT4/AHU77sh2+EdIXwkSEyPQ5ZP2+vRHvytn6H1hf63Wo7gPdDc59KZfz+10kjywPqQUXYOoSbeQ6cxx2dxmf2nSwSd2Wh1jA=="
},
{
"url": "http://localhost/s3-encrypted/DATASET0001/htsnexus_test_NA12878.bam.c4gh",
"url": "http://localhost/s3/DATASET0001/htsnexus_test_NA12878.bam.c4gh",
"headers": {
"Range": "bytes=124-1049147",
"accept-encoding": "gzip",
Expand All @@ -159,7 +159,7 @@ KKj6NUcJGZ2/HeqkYbxm57ZaFLP5cIHsdK+0nQubFVs=
}
},
{
"url": "http://localhost/s3-encrypted/DATASET0001/htsnexus_test_NA12878.bam.c4gh",
"url": "http://localhost/s3/DATASET0001/htsnexus_test_NA12878.bam.c4gh",
"headers": {
"Range": "bytes=2557120-2598042",
"accept-encoding": "gzip",
Expand Down
2 changes: 1 addition & 1 deletion testing/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,7 @@ services:
- GRPC_PORT=50051
- GRPC_HOST=reencrypt
- APP_SERVEUNENCRYPTEDDATA=true
image: "ghcr.io/neicnordic/sensitive-data-archive:v0.3.179-download" #this is the last version that supports /s3-encrypted
image: "ghcr.io/neicnordic/sensitive-data-archive:${TAG}-download"
volumes:
- ./archive_data/4293c9a7-dc50-46db-b79a-27ddc0dad1c6:/tmp/4293c9a7-dc50-46db-b79a-27ddc0dad1c6
mem_limit: 256m
Expand Down

0 comments on commit 5e622b3

Please sign in to comment.