Skip to content

Commit

Permalink
rank data added
Browse files Browse the repository at this point in the history
  • Loading branch information
sureshhewabi committed Feb 17, 2025
1 parent d01bb80 commit 66033d1
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 1 deletion.
3 changes: 2 additions & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,5 @@ dependencies:
- dask>=2023.1.0
- plotly
- mkdocs-material
- yaml
- yaml
- scipy
7 changes: 7 additions & 0 deletions filedownloadstat/parquet_analyzer.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os
import dask.dataframe as dd
from scipy.stats import rankdata

class ParquetAnalyzer:
def __init__(self):
Expand Down Expand Up @@ -32,6 +33,12 @@ def persist_project_level_download_counts(self, ddf, project_level_download_coun
# Compute the result
result = project_level_counts.compute()

# Calculate the percentile rank of each row's 'count' (ties share their average rank; the result is truncated to an integer)
result["percentile"] = (rankdata(result["count"], method="average") / len(result) * 100).astype(int)

# Sort by count in descending order
result = result.sort_values(by="count", ascending=False)

# Save to JSON as a single array of record objects (not line-delimited)
result.to_json(project_level_download_counts, orient="records", lines=False)

Expand Down

0 comments on commit 66033d1

Please sign in to comment.