
Commit 57184d5

[UPDATE] Main Python File
1 parent 990c847 commit 57184d5

File tree:
scripts/explorer.sh
src/main.py

2 files changed (+115, −84)

scripts/explorer.sh

Lines changed: 52 additions & 42 deletions
@@ -5,124 +5,134 @@
 # Email: francisco.calisto@tecnico.ulisboa.pt
 # License: ACADEMIC & COMMERCIAL
 # Created Date: 2024-09-22
-# Revised Date: 2024-09-26  # Enhanced logging and optimized Patient ID matching.
-# Version: 2.26
+# Revised Date: 2024-09-29  # Improved logging, error handling, and optimized Patient ID processing
+# Version: 2.28
 # Status: Development
 # Usage: ./explorer.sh
 # Example: ./scripts/explorer.sh
 # Description: Processes DICOM files, extracts Patient IDs, compares with CSV, and moves matches to the "checking" folder.
 
-# Exit script immediately if any command fails
+# Exit script immediately if any command fails to prevent further errors
 set -e
 
-# Configuration: Define how many files to process in one run.
-FILE_LIMIT=50000  # Adjust the limit for production use
+# Configuration: Set the maximum number of DICOM files to process in one run
+FILE_LIMIT=1  # You can adjust this for testing or set higher for production
 
-# Define key directories and file paths
+# Define key directories and file paths for processing
 home="$HOME"  # User's home directory
-root_dir="$home/Git"  # Root directory for the project
-unchecked_dir="$root_dir/dataset-multimodal-breast/data/curation/unexplored"  # Directory with unprocessed DICOM files
-checking_dir="$root_dir/dataset-multimodal-breast/data/curation/checking"  # Directory to move matched DICOM files
-csv_file="$root_dir/dataset-multimodal-breast/data/birads/anonymized_patients_birads_curation.csv"  # CSV file with patient data
-LOG_DIR="$root_dir/dataset-multimodal-breast/data/logs"  # Log directory
-LOG_FILE="$LOG_DIR/explorer_$(date +'%Y%m%d_%H%M%S').log"  # Log file with timestamp
-
-# Ensure the log directory exists
+root_dir="$home/Git"  # Root project directory
+unchecked_dir="$root_dir/dataset-multimodal-breast/data/curation/unexplored"  # Unprocessed DICOM files
+checking_dir="$root_dir/dataset-multimodal-breast/data/curation/checking"  # Folder for files with matching Patient IDs
+csv_file="$root_dir/dataset-multimodal-breast/data/birads/anonymized_patients_birads_curation.csv"  # CSV file containing anonymized patient IDs
+LOG_DIR="$root_dir/dataset-multimodal-breast/data/logs"  # Log directory for all logging
+LOG_FILE="$LOG_DIR/explorer_$(date +'%Y%m%d_%H%M%S').log"  # Log file with timestamp for uniqueness
+
+# Ensure the log directory exists, creating it if necessary
 mkdir -p "$LOG_DIR"
 
-# Function to log messages to both console and log file
+# Function to log messages with timestamps
 log_message() {
   echo "$(date +'%Y-%m-%d %H:%M:%S') - $1" | tee -a "$LOG_FILE"
 }
 
-# Validate that required directories and files exist
+# Validate if a directory or file exists, exiting the script if it doesn't
+# Arguments:
+#   $1: Path to validate
+#   $2: Friendly name to display in case of an error
 validate_path() {
   if [ ! -e "$1" ]; then
     log_message "Error: $2 ($1) does not exist. Exiting."
-    exit 1
+    exit 1  # Terminate the script if the path is invalid
   fi
 }
 
-# Ensure essential paths exist
-validate_path "$unchecked_dir" "Unchecked folder"
+# Validate that all required paths (directories and CSV file) exist before starting
+validate_path "$unchecked_dir" "Unchecked DICOM folder"
 validate_path "$checking_dir" "Checking folder"
 validate_path "$csv_file" "CSV file"
 
-# Function to extract the Patient ID from DICOM file metadata
-# Uses `dcmdump` and extracts the Patient ID from the DICOM tag (0010,0020)
+# Function to extract the Patient ID from a DICOM file
+# Uses dcmdump to extract Patient ID from tag (0010,0020) and cleans output
+# Arguments:
+#   $1: Full path to the DICOM file
 extract_patient_id() {
   local dicom_file="$1"
+
   log_message "Attempting to extract Patient ID from: $dicom_file"
 
-  # Extract Patient ID from DICOM metadata, cleaning up any whitespace
+  # Extract Patient ID using dcmdump, capturing only the ID and removing extra whitespace
   local patient_id=$(dcmdump +P PatientID "$dicom_file" 2>/dev/null | awk -F'[][]' '{print $2}' | tr -d '[:space:]')
 
+  # Check if a Patient ID was extracted and log the result
   if [ -n "$patient_id" ]; then
     log_message "Successfully extracted Patient ID: $patient_id"
-    echo "$patient_id"
+    echo "$patient_id"  # Return the Patient ID
   else
     log_message "No Patient ID found in DICOM file: $dicom_file"
-    echo ""
+    echo ""  # Return an empty string if no ID was found
   fi
 }
 
-# Function to check if a Patient ID exists in the CSV file
-# Uses grep to look for exact matches in the second column of the CSV
+# Function to check if a given Patient ID exists in the CSV file
+# Uses grep to search for the ID in the second column of the CSV
+# Arguments:
+#   $1: The Patient ID to search for
 patient_id_in_csv() {
   local patient_id="$1"
 
-  # Log that we're checking for the patient ID in the CSV
-  log_message "Checking if Patient ID: $patient_id exists in CSV..."
+  log_message "Checking if Patient ID: $patient_id exists in the CSV file..."
 
-  # Use grep to search for the patient ID in the CSV
+  # Search for the Patient ID in the CSV, ensuring an exact match in the correct column
   if grep -q ",${patient_id}," "$csv_file"; then
-    log_message "Patient ID: $patient_id found in CSV"
-    return 0  # Found
+    log_message "Patient ID: $patient_id found in the CSV"
+    return 0  # Return 0 (success) if the Patient ID is found
   else
-    log_message "Patient ID: $patient_id not found in CSV"
-    return 1  # Not found
+    log_message "Patient ID: $patient_id not found in the CSV"
+    return 1  # Return 1 (failure) if the Patient ID is not found
  fi
 }
 
-# Main function to process DICOM files
+# Main function to process the DICOM files
 process_files() {
-  local count=0  # Track number of processed files
+  local count=0  # Initialize a counter for processed files
 
   log_message "Starting to process DICOM files from: $unchecked_dir"
 
-  # Find and process DICOM files in the unchecked directory
+  # Find all DICOM files in the unexplored folder, limiting to the FILE_LIMIT
   find "$unchecked_dir" -type f -name "*.dcm" | head -n "$FILE_LIMIT" | while IFS= read -r dicom_file; do
+    # Stop processing if the file limit has been reached
     if (( count >= FILE_LIMIT )); then
       log_message "File limit of $FILE_LIMIT reached. Stopping."
       break
     fi
 
-    # Extract Patient ID from DICOM file
+    # Extract the Patient ID from the DICOM file
     patient_id=$(extract_patient_id "$dicom_file")
 
     # Proceed if a valid Patient ID was extracted
     if [ -n "$patient_id" ]; then
+      # Check if the Patient ID exists in the CSV
       if patient_id_in_csv "$patient_id"; then
-        # If the Patient ID exists in the CSV, move the DICOM file to the checking directory
+        # Attempt to move the DICOM file if the Patient ID matches
        if mv "$dicom_file" "$checking_dir"; then
          log_message "Successfully moved $dicom_file to $checking_dir (Patient ID: $patient_id)"
        else
-          log_message "Failed to move $dicom_file to $checking_dir. Skipping."
+          log_message "Error: Failed to move $dicom_file to $checking_dir. Skipping this file."
        fi
      else
        log_message "No matching Patient ID in CSV for: $dicom_file"
      fi
    else
-      log_message "Skipping $dicom_file due to missing Patient ID."
+      log_message "Skipping $dicom_file due to missing or invalid Patient ID."
    fi
 
-    ((count++))  # Increment the file counter after each processed file
+    ((count++))  # Increment the file counter
   done
 
   log_message "Processed $count file(s) out of the $FILE_LIMIT limit."
 }
 
-# Start processing DICOM files
+# Start processing the DICOM files
 process_files
 
 log_message "DICOM file exploration completed successfully."

src/main.py

Lines changed: 63 additions & 42 deletions
@@ -1,39 +1,39 @@
 #!/usr/bin/env python
 
 """
-main.py: Optimized for handling large datasets with batch processing and improved logging.
+main.py: Optimized for handling large datasets with batch processing, enhanced logging, and memory monitoring.
 
-This script initializes logging, sets up paths for input/output directories,
-and runs the data processing pipeline by invoking the `process_directory` function
+This script initializes logging, sets up paths for input/output directories,
+and runs the data processing pipeline by invoking the `process_directory` function
 from the `processing.processor` module. It handles DICOM files and logs the results efficiently.
+
+Improvements:
+- Added memory monitoring before and after batch processing.
+- Implemented explicit garbage collection to optimize memory usage for large datasets.
+- Enhanced logging to trace each step in detail.
 """
 
 __author__ = "Francisco Maria Calisto"
 __maintainer__ = "Francisco Maria Calisto"
 __email__ = "francisco.calisto@tecnico.ulisboa.pt"
 __license__ = "ACADEMIC & COMMERCIAL"
-__version__ = "0.7.1"  # Incremented version after improvements
+__version__ = "0.7.2"  # Updated version after improvements
 __status__ = "Development"
-__credits__ = ["Carlos Santiago",
-               "Catarina Barata",
-               "Jacinto C. Nascimento",
-               "Diogo Araújo"]
+__credits__ = ["Carlos Santiago", "Catarina Barata", "Jacinto C. Nascimento", "Diogo Araújo"]
 
 import logging
 import os
 import gc
-import psutil  # Added for memory monitoring
+import psutil  # For memory monitoring
 from pathlib import Path
 from datetime import datetime
-from processing.processor import process_directory
-
-# Define batch size for large dataset processing
-BATCH_SIZE = int(os.getenv('BATCH_SIZE', 100))  # Default batch size is 100, can be set via environment variable
+from processing.processor import process_directory  # Import the main processing function
 
-# Define constant for mapping file name
+# Define constants for batch size and mapping file
+BATCH_SIZE = int(os.getenv('BATCH_SIZE', 100))  # Default batch size of 100, adjustable via environment variable
 MAPPING_FN = "mapping.csv"
 
-# Define paths for input/output directories
+# Define root and folder paths for input/output/logging
 ROOT_DIR = Path(__file__).resolve().parents[2]
 SOURCE_FOLDER = ROOT_DIR / "dicom-images-breast" / "known" / "raw"
 OUTPUT_FOLDER = ROOT_DIR / "dataset-multimodal-breast" / "data" / "curation" / "unexplored"
@@ -42,70 +42,91 @@
 
 def setup_logging(logs_folder: Path):
     """
-    Set up logging configuration to log both to a file and the console.
-
+    Set up logging to log both to a file and the console.
+
     Args:
-        logs_folder (Path): The directory where log files will be saved.
+        logs_folder (Path): Directory where log files will be saved.
+
+    Detailed logging configuration that saves logs to both the console and a timestamped file.
     """
-    # Create the logs folder if it doesn't exist
+    # Create logs folder if it doesn't exist
     logs_folder.mkdir(parents=True, exist_ok=True)
 
-    # Define log file with a timestamp
+    # Create log file with timestamp to differentiate logs for each run
     logs_timestamp = datetime.now().strftime('%Y%m%d%H%M%S')
     log_file = logs_folder / f"log_{logs_timestamp}.log"
 
     # Set up logging configuration
     logging.basicConfig(
-        level=logging.INFO,
+        level=logging.INFO,  # Log level set to INFO to capture general runtime events
         format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
         handlers=[
-            logging.FileHandler(log_file),  # Log to file
-            logging.StreamHandler()  # Log to console
+            logging.FileHandler(log_file),  # Log messages to file
+            logging.StreamHandler()  # Log messages to console
        ]
    )
-    logging.info(f"Logging initialized. Log file: {log_file}")
 
-def monitor_memory():
+    # Log that the logging system has been initialized
+    logging.info(f"Logging initialized. Log file created at: {log_file}")
+
+def monitor_memory(stage: str):
     """
-    Monitor and log current memory usage.
+    Monitor and log current memory usage at different stages of execution.
+
+    Args:
+        stage (str): Descriptive label of the current stage of the program for logging.
+
+    Logs memory usage in both Resident Set Size (RSS) and Virtual Memory Size (VMS).
     """
-    process = psutil.Process(os.getpid())
-    memory_info = process.memory_info()
-    logging.info(f"Memory usage: RSS={memory_info.rss / (1024 * 1024)} MB, VMS={memory_info.vms / (1024 * 1024)} MB")
+    process = psutil.Process(os.getpid())  # Get the current process ID
+    memory_info = process.memory_info()  # Get memory usage info
+    logging.info(f"[{stage}] Memory usage: RSS={memory_info.rss / (1024 * 1024):.2f} MB, VMS={memory_info.vms / (1024 * 1024):.2f} MB")
 
 def main():
     """
     Main function for running the data processing pipeline.
-    It processes DICOM files from the source folder and maps them to an output folder.
-    Supports batch processing for large datasets.
+
+    Processes DICOM files in batches, monitors memory usage before and after processing,
+    and ensures proper logging and error handling throughout the pipeline.
     """
     logging.info("Starting the data processing pipeline...")
+
+    # Log folder paths
     logging.info(f"Source folder: {SOURCE_FOLDER}")
     logging.info(f"Output folder: {OUTPUT_FOLDER}")
     logging.info(f"Mapping file: {MAPPING_FILE}")
-    logging.info(f"Batch size: {BATCH_SIZE}")
+    logging.info(f"Batch size set to: {BATCH_SIZE}")
 
-    # Ensure the output folder exists
+    # Ensure the output directory exists
     OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True)
 
-    # Monitor initial memory usage
-    monitor_memory()
+    # Monitor memory usage at the start of processing
+    monitor_memory("Initial")
 
-    # Execute the processing pipeline in batches
+    # Execute the processing pipeline in batches, with error handling
     try:
+        # Process the directory using the defined batch size and mapping file
+        logging.info("Starting batch processing...")
         process_directory(SOURCE_FOLDER, OUTPUT_FOLDER, MAPPING_FILE, BATCH_SIZE)
-        logging.info("Data processing pipeline completed successfully.")
+        logging.info("Batch processing completed successfully.")
     except Exception as e:
+        # Log any exceptions encountered during processing
         logging.error(f"An error occurred during the data processing pipeline: {e}")
         raise
     finally:
-        # Explicit garbage collection to free memory
+        # Perform explicit garbage collection to free memory after processing
+        logging.info("Initiating garbage collection...")
         gc.collect()
-        logging.info("Garbage collection completed.")
-        monitor_memory()  # Log memory usage after garbage collection
+        logging.info("Garbage collection complete.")
+
+        # Monitor memory usage after processing and garbage collection
+        monitor_memory("Post-GC")
 
 if __name__ == "__main__":
-    setup_logging(LOGS_FOLDER)  # Initialize logging
-    main()  # Run the main pipeline function
+    # Set up logging before starting the main process
+    setup_logging(LOGS_FOLDER)
+
+    # Run the main function that starts the data processing pipeline
+    main()
 
 # End of file
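
main.py delegates the actual work to process_directory(SOURCE_FOLDER, OUTPUT_FOLDER, MAPPING_FILE, BATCH_SIZE), but processing/processor.py is not part of this commit. A minimal sketch of the batch-oriented shape that call signature implies — the body below is an assumption for illustration, not the repository's implementation:

import csv
import gc
import logging
import shutil
from pathlib import Path

def process_directory(source: Path, output: Path, mapping_file: Path, batch_size: int) -> None:
    """Hypothetical batch processor: moves DICOM files in slices of `batch_size`
    and appends a source-to-target record to the mapping CSV for each move."""
    files = sorted(source.rglob("*.dcm"))  # every DICOM file under the source tree
    with open(mapping_file, "a", newline="") as fh:
        writer = csv.writer(fh)
        for start in range(0, len(files), batch_size):
            batch = files[start:start + batch_size]
            logging.info(f"Processing batch {start // batch_size + 1} ({len(batch)} files)")
            for dicom_file in batch:
                target = output / dicom_file.name
                shutil.move(str(dicom_file), str(target))  # relocate into the curation folder
                writer.writerow([str(dicom_file), str(target)])
            gc.collect()  # drop per-batch references, matching main.py's explicit GC strategy

Working in fixed-size slices keeps peak memory roughly constant regardless of dataset size, which is what the monitor_memory("Initial") / monitor_memory("Post-GC") pair in main.py is there to verify. The batch size can also be tuned per run without code changes, e.g. BATCH_SIZE=500 python src/main.py.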
