
Commit 57184d5

[UPDATE] Main Python File
1 parent 990c847 commit 57184d5

File tree:
scripts/explorer.sh
src/main.py

2 files changed (+115, −84)

scripts/explorer.sh

Lines changed: 52 additions & 42 deletions
@@ -5,124 +5,134 @@
 # Email: francisco.calisto@tecnico.ulisboa.pt
 # License: ACADEMIC & COMMERCIAL
 # Created Date: 2024-09-22
-# Revised Date: 2024-09-26  # Enhanced logging and optimized Patient ID matching.
-# Version: 2.26
+# Revised Date: 2024-09-29  # Improved logging, error handling, and optimized Patient ID processing
+# Version: 2.28
 # Status: Development
 # Usage: ./explorer.sh
 # Example: ./scripts/explorer.sh
 # Description: Processes DICOM files, extracts Patient IDs, compares with CSV, and moves matches to the "checking" folder.
 
-# Exit script immediately if any command fails
+# Exit script immediately if any command fails to prevent further errors
 set -e
 
-# Configuration: Define how many files to process in one run.
-FILE_LIMIT=50000  # Adjust the limit for production use
+# Configuration: Set the maximum number of DICOM files to process in one run
+FILE_LIMIT=1  # You can adjust this for testing or set higher for production
 
-# Define key directories and file paths
+# Define key directories and file paths for processing
 home="$HOME"  # User's home directory
-root_dir="$home/Git"  # Root directory for the project
-unchecked_dir="$root_dir/dataset-multimodal-breast/data/curation/unexplored"  # Directory with unprocessed DICOM files
-checking_dir="$root_dir/dataset-multimodal-breast/data/curation/checking"  # Directory to move matched DICOM files
-csv_file="$root_dir/dataset-multimodal-breast/data/birads/anonymized_patients_birads_curation.csv"  # CSV file with patient data
-LOG_DIR="$root_dir/dataset-multimodal-breast/data/logs"  # Log directory
-LOG_FILE="$LOG_DIR/explorer_$(date +'%Y%m%d_%H%M%S').log"  # Log file with timestamp
-
-# Ensure the log directory exists
+root_dir="$home/Git"  # Root project directory
+unchecked_dir="$root_dir/dataset-multimodal-breast/data/curation/unexplored"  # Unprocessed DICOM files
+checking_dir="$root_dir/dataset-multimodal-breast/data/curation/checking"  # Folder for files with matching Patient IDs
+csv_file="$root_dir/dataset-multimodal-breast/data/birads/anonymized_patients_birads_curation.csv"  # CSV file containing anonymized patient IDs
+LOG_DIR="$root_dir/dataset-multimodal-breast/data/logs"  # Log directory for all logging
+LOG_FILE="$LOG_DIR/explorer_$(date +'%Y%m%d_%H%M%S').log"  # Log file with timestamp for uniqueness
+
+# Ensure the log directory exists, creating it if necessary
 mkdir -p "$LOG_DIR"
 
-# Function to log messages to both console and log file
+# Function to log messages with timestamps
 log_message() {
   echo "$(date +'%Y-%m-%d %H:%M:%S') - $1" | tee -a "$LOG_FILE"
 }
 
-# Validate that required directories and files exist
+# Validate if a directory or file exists, exiting the script if it doesn't
+# Arguments:
+#   $1: Path to validate
+#   $2: Friendly name to display in case of an error
 validate_path() {
   if [ ! -e "$1" ]; then
     log_message "Error: $2 ($1) does not exist. Exiting."
-    exit 1
+    exit 1  # Terminate the script if the path is invalid
   fi
 }
 
-# Ensure essential paths exist
-validate_path "$unchecked_dir" "Unchecked folder"
+# Validate that all required paths (directories and CSV file) exist before starting
+validate_path "$unchecked_dir" "Unchecked DICOM folder"
 validate_path "$checking_dir" "Checking folder"
 validate_path "$csv_file" "CSV file"
 
-# Function to extract the Patient ID from DICOM file metadata
-# Uses `dcmdump` and extracts the Patient ID from the DICOM tag (0010,0020)
+# Function to extract the Patient ID from a DICOM file
+# Uses dcmdump to extract Patient ID from tag (0010,0020) and cleans output
+# Arguments:
+#   $1: Full path to the DICOM file
 extract_patient_id() {
   local dicom_file="$1"
+
   log_message "Attempting to extract Patient ID from: $dicom_file"
 
-  # Extract Patient ID from DICOM metadata, cleaning up any whitespace
+  # Extract Patient ID using dcmdump, capturing only the ID and removing extra whitespace
   local patient_id=$(dcmdump +P PatientID "$dicom_file" 2>/dev/null | awk -F'[][]' '{print $2}' | tr -d '[:space:]')
 
+  # Check if a Patient ID was extracted and log the result
   if [ -n "$patient_id" ]; then
     log_message "Successfully extracted Patient ID: $patient_id"
-    echo "$patient_id"
+    echo "$patient_id"  # Return the Patient ID
   else
     log_message "No Patient ID found in DICOM file: $dicom_file"
-    echo ""
+    echo ""  # Return an empty string if no ID was found
   fi
 }
 
-# Function to check if a Patient ID exists in the CSV file
-# Uses grep to look for exact matches in the second column of the CSV
+# Function to check if a given Patient ID exists in the CSV file
+# Uses grep to search for the ID in the second column of the CSV
+# Arguments:
+#   $1: The Patient ID to search for
 patient_id_in_csv() {
   local patient_id="$1"
 
-  # Log that we're checking for the patient ID in the CSV
-  log_message "Checking if Patient ID: $patient_id exists in CSV..."
+  log_message "Checking if Patient ID: $patient_id exists in the CSV file..."
 
-  # Use grep to search for the patient ID in the CSV
+  # Search for the Patient ID in the CSV, ensuring an exact match in the correct column
   if grep -q ",${patient_id}," "$csv_file"; then
-    log_message "Patient ID: $patient_id found in CSV"
-    return 0  # Found
+    log_message "Patient ID: $patient_id found in the CSV"
+    return 0  # Return 0 (success) if the Patient ID is found
   else
-    log_message "Patient ID: $patient_id not found in CSV"
-    return 1  # Not found
+    log_message "Patient ID: $patient_id not found in the CSV"
+    return 1  # Return 1 (failure) if the Patient ID is not found
  fi
 }
 
-# Main function to process DICOM files
+# Main function to process the DICOM files
 process_files() {
-  local count=0  # Track number of processed files
+  local count=0  # Initialize a counter for processed files
 
   log_message "Starting to process DICOM files from: $unchecked_dir"
 
-  # Find and process DICOM files in the unchecked directory
+  # Find all DICOM files in the unexplored folder, limiting to the FILE_LIMIT
   find "$unchecked_dir" -type f -name "*.dcm" | head -n "$FILE_LIMIT" | while IFS= read -r dicom_file; do
+    # Stop processing if the file limit has been reached
     if (( count >= FILE_LIMIT )); then
       log_message "File limit of $FILE_LIMIT reached. Stopping."
       break
     fi
 
-    # Extract Patient ID from DICOM file
+    # Extract the Patient ID from the DICOM file
     patient_id=$(extract_patient_id "$dicom_file")
 
     # Proceed if a valid Patient ID was extracted
     if [ -n "$patient_id" ]; then
+      # Check if the Patient ID exists in the CSV
       if patient_id_in_csv "$patient_id"; then
-        # If the Patient ID exists in the CSV, move the DICOM file to the checking directory
+        # Attempt to move the DICOM file if the Patient ID matches
        if mv "$dicom_file" "$checking_dir"; then
          log_message "Successfully moved $dicom_file to $checking_dir (Patient ID: $patient_id)"
        else
-          log_message "Failed to move $dicom_file to $checking_dir. Skipping."
+          log_message "Error: Failed to move $dicom_file to $checking_dir. Skipping this file."
        fi
      else
        log_message "No matching Patient ID in CSV for: $dicom_file"
      fi
    else
-      log_message "Skipping $dicom_file due to missing Patient ID."
+      log_message "Skipping $dicom_file due to missing or invalid Patient ID."
    fi
 
-    ((count++))  # Increment the file counter after each processed file
+    ((count++))  # Increment the file counter
   done
 
   log_message "Processed $count file(s) out of the $FILE_LIMIT limit."
 }
 
-# Start processing DICOM files
+# Start processing the DICOM files
 process_files
 
 log_message "DICOM file exploration completed successfully."

src/main.py

Lines changed: 63 additions & 42 deletions
@@ -1,39 +1,39 @@
 #!/usr/bin/env python
 
 """
-main.py: Optimized for handling large datasets with batch processing and improved logging.
+main.py: Optimized for handling large datasets with batch processing, enhanced logging, and memory monitoring.
 
-This script initializes logging, sets up paths for input/output directories,
-and runs the data processing pipeline by invoking the `process_directory` function
+This script initializes logging, sets up paths for input/output directories,
+and runs the data processing pipeline by invoking the `process_directory` function
 from the `processing.processor` module. It handles DICOM files and logs the results efficiently.
+
+Improvements:
+- Added memory monitoring before and after batch processing.
+- Implemented explicit garbage collection to optimize memory usage for large datasets.
+- Enhanced logging to trace each step in detail.
 """
 
 __author__ = "Francisco Maria Calisto"
 __maintainer__ = "Francisco Maria Calisto"
 __email__ = "francisco.calisto@tecnico.ulisboa.pt"
 __license__ = "ACADEMIC & COMMERCIAL"
-__version__ = "0.7.1"  # Incremented version after improvements
+__version__ = "0.7.2"  # Updated version after improvements
 __status__ = "Development"
-__credits__ = ["Carlos Santiago",
-               "Catarina Barata",
-               "Jacinto C. Nascimento",
-               "Diogo Araújo"]
+__credits__ = ["Carlos Santiago", "Catarina Barata", "Jacinto C. Nascimento", "Diogo Araújo"]
 
 import logging
 import os
 import gc
-import psutil  # Added for memory monitoring
+import psutil  # For memory monitoring
 from pathlib import Path
 from datetime import datetime
-from processing.processor import process_directory
-
-# Define batch size for large dataset processing
-BATCH_SIZE = int(os.getenv('BATCH_SIZE', 100))  # Default batch size is 100, can be set via environment variable
+from processing.processor import process_directory  # Import the main processing function
 
-# Define constant for mapping file name
+# Define constants for batch size and mapping file
+BATCH_SIZE = int(os.getenv('BATCH_SIZE', 100))  # Default batch size of 100, adjustable via environment variable
 MAPPING_FN = "mapping.csv"
 
-# Define paths for input/output directories
+# Define root and folder paths for input/output/logging
 ROOT_DIR = Path(__file__).resolve().parents[2]
 SOURCE_FOLDER = ROOT_DIR / "dicom-images-breast" / "known" / "raw"
 OUTPUT_FOLDER = ROOT_DIR / "dataset-multimodal-breast" / "data" / "curation" / "unexplored"
@@ -42,70 +42,91 @@
 
 def setup_logging(logs_folder: Path):
     """
-    Set up logging configuration to log both to a file and the console.
-
+    Set up logging to log both to a file and the console.
+
     Args:
-        logs_folder (Path): The directory where log files will be saved.
+        logs_folder (Path): Directory where log files will be saved.
+
+    Detailed logging configuration that saves logs to both the console and a timestamped file.
     """
-    # Create the logs folder if it doesn't exist
+    # Create logs folder if it doesn't exist
     logs_folder.mkdir(parents=True, exist_ok=True)
 
-    # Define log file with a timestamp
+    # Create log file with timestamp to differentiate logs for each run
     logs_timestamp = datetime.now().strftime('%Y%m%d%H%M%S')
     log_file = logs_folder / f"log_{logs_timestamp}.log"
 
     # Set up logging configuration
     logging.basicConfig(
-        level=logging.INFO,
+        level=logging.INFO,  # Log level set to INFO to capture general runtime events
         format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
         handlers=[
-            logging.FileHandler(log_file),  # Log to file
-            logging.StreamHandler()  # Log to console
+            logging.FileHandler(log_file),  # Log messages to file
+            logging.StreamHandler()  # Log messages to console
        ]
    )
-    logging.info(f"Logging initialized. Log file: {log_file}")
 
-def monitor_memory():
+    # Log that the logging system has been initialized
+    logging.info(f"Logging initialized. Log file created at: {log_file}")
+
+def monitor_memory(stage: str):
     """
-    Monitor and log current memory usage.
+    Monitor and log current memory usage at different stages of execution.
+
+    Args:
+        stage (str): Descriptive label of the current stage of the program for logging.
+
+    Logs memory usage in both Resident Set Size (RSS) and Virtual Memory Size (VMS).
     """
-    process = psutil.Process(os.getpid())
-    memory_info = process.memory_info()
-    logging.info(f"Memory usage: RSS={memory_info.rss / (1024 * 1024)} MB, VMS={memory_info.vms / (1024 * 1024)} MB")
+    process = psutil.Process(os.getpid())  # Get the current process ID
+    memory_info = process.memory_info()  # Get memory usage info
+    logging.info(f"[{stage}] Memory usage: RSS={memory_info.rss / (1024 * 1024):.2f} MB, VMS={memory_info.vms / (1024 * 1024):.2f} MB")
 
 def main():
     """
     Main function for running the data processing pipeline.
-    It processes DICOM files from the source folder and maps them to an output folder.
-    Supports batch processing for large datasets.
+
+    Processes DICOM files in batches, monitors memory usage before and after processing,
+    and ensures proper logging and error handling throughout the pipeline.
     """
     logging.info("Starting the data processing pipeline...")
+
+    # Log folder paths
     logging.info(f"Source folder: {SOURCE_FOLDER}")
     logging.info(f"Output folder: {OUTPUT_FOLDER}")
     logging.info(f"Mapping file: {MAPPING_FILE}")
-    logging.info(f"Batch size: {BATCH_SIZE}")
+    logging.info(f"Batch size set to: {BATCH_SIZE}")
 
-    # Ensure the output folder exists
+    # Ensure the output directory exists
     OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True)
 
-    # Monitor initial memory usage
-    monitor_memory()
+    # Monitor memory usage at the start of processing
+    monitor_memory("Initial")
 
-    # Execute the processing pipeline in batches
+    # Execute the processing pipeline in batches, with error handling
     try:
+        # Process the directory using the defined batch size and mapping file
+        logging.info("Starting batch processing...")
         process_directory(SOURCE_FOLDER, OUTPUT_FOLDER, MAPPING_FILE, BATCH_SIZE)
-        logging.info("Data processing pipeline completed successfully.")
+        logging.info("Batch processing completed successfully.")
     except Exception as e:
+        # Log any exceptions encountered during processing
         logging.error(f"An error occurred during the data processing pipeline: {e}")
         raise
     finally:
-        # Explicit garbage collection to free memory
+        # Perform explicit garbage collection to free memory after processing
+        logging.info("Initiating garbage collection...")
         gc.collect()
-        logging.info("Garbage collection completed.")
-        monitor_memory()  # Log memory usage after garbage collection
+        logging.info("Garbage collection complete.")
+
+        # Monitor memory usage after processing and garbage collection
+        monitor_memory("Post-GC")
 
 if __name__ == "__main__":
-    setup_logging(LOGS_FOLDER)  # Initialize logging
-    main()  # Run the main pipeline function
+    # Set up logging before starting the main process
+    setup_logging(LOGS_FOLDER)
+
+    # Run the main function that starts the data processing pipeline
+    main()
 
 # End of file
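
main.py delegates the actual work to process_directory(SOURCE_FOLDER, OUTPUT_FOLDER, MAPPING_FILE, BATCH_SIZE), but processing/processor.py is not part of this commit. A minimal sketch of the batch-oriented shape that call signature implies — the body below is an assumption for illustration, not the repository's implementation:

import csv
import gc
import logging
import shutil
from pathlib import Path

def process_directory(source: Path, output: Path, mapping_file: Path, batch_size: int) -> None:
    """Hypothetical batch processor: moves DICOM files in slices of `batch_size`
    and appends a source-to-target record to the mapping CSV for each move."""
    files = sorted(source.rglob("*.dcm"))  # every DICOM file under the source tree
    with open(mapping_file, "a", newline="") as fh:
        writer = csv.writer(fh)
        for start in range(0, len(files), batch_size):
            batch = files[start:start + batch_size]
            logging.info(f"Processing batch {start // batch_size + 1} ({len(batch)} files)")
            for dicom_file in batch:
                target = output / dicom_file.name
                shutil.move(str(dicom_file), str(target))  # relocate into the curation folder
                writer.writerow([str(dicom_file), str(target)])
            gc.collect()  # drop per-batch references, matching main.py's explicit GC strategy

Working in fixed-size slices keeps peak memory roughly constant regardless of dataset size, which is what the monitor_memory("Initial") / monitor_memory("Post-GC") pair in main.py is there to verify. The batch size can also be tuned per run without code changes, e.g. BATCH_SIZE=500 python src/main.py.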
