diff --git a/filedownloadstat/parquet_analyzer.py b/filedownloadstat/parquet_analyzer.py index cf71b75..551dc38 100644 --- a/filedownloadstat/parquet_analyzer.py +++ b/filedownloadstat/parquet_analyzer.py @@ -1,4 +1,5 @@ import os +import pandas as pd import dask.dataframe as dd from scipy.stats import rankdata @@ -69,8 +70,14 @@ def persist_project_level_yearly_download_counts(self, ddf, project_level_yearly # Compute the result result = file_counts.compute() - # Save to JSON - result.to_json(project_level_yearly_download_counts, orient="records", lines=False) + # Convert to the desired nested JSON structure + grouped = result.groupby("accession").apply(lambda x: { + "accession": x["accession"].iloc[0], + "yearlyDownloads": x[["year", "count"]].to_dict(orient="records") + }).tolist() + + # Save to JSON file without indentation + pd.DataFrame(grouped).to_json(project_level_yearly_download_counts, orient="records", lines=False) print(f"{project_level_yearly_download_counts} file saved successfully!") diff --git a/scripts/.run_stat.sh b/scripts/.run_stat.sh index 87b4b87..a52e40f 100644 --- a/scripts/.run_stat.sh +++ b/scripts/.run_stat.sh @@ -17,4 +17,4 @@ nextflow -log $LOG_FILE run ${PIPELINE_BASE_DIR}main.nf \ --log_file $LOG_FILE \ --api_endpoint_file_downloads_per_project $API_ENDPOINT_FILE_DOWNLOADS_PER_PROJECT \ --api_endpoint_file_downloads_per_file $API_ENDPOINT_FILE_DOWNLOADS_PER_FILE \ - --api_endpoint_header $API_ENDPOINT_HEADER -resume -with-tower \ No newline at end of file + --api_endpoint_header "$API_ENDPOINT_HEADER" -resume -with-tower \ No newline at end of file