#!/usr/bin/env python

"""
- main.py: Optimized for handling large datasets with batch processing and improved logging.
+ main.py: Optimized for handling large datasets with batch processing, enhanced logging, and memory monitoring.

- This script initializes logging, sets up paths for input/output directories,
- and runs the data processing pipeline by invoking the `process_directory` function
+ This script initializes logging, sets up paths for input/output directories,
+ and runs the data processing pipeline by invoking the `process_directory` function
from the `processing.processor` module. It handles DICOM files and logs the results efficiently.
+
+ Improvements:
+ - Added memory monitoring before and after batch processing.
+ - Implemented explicit garbage collection to optimize memory usage for large datasets.
+ - Enhanced logging to trace each step in detail.
"""

__author__ = "Francisco Maria Calisto"
__maintainer__ = "Francisco Maria Calisto"
__email__ = "francisco.calisto@tecnico.ulisboa.pt"
__license__ = "ACADEMIC & COMMERCIAL"
- __version__ = "0.7.1"  # Incremented version after improvements
+ __version__ = "0.7.2"  # Updated version after improvements
__status__ = "Development"
- __credits__ = ["Carlos Santiago",
-                "Catarina Barata",
-                "Jacinto C. Nascimento",
-                "Diogo Araújo"]
+ __credits__ = ["Carlos Santiago", "Catarina Barata", "Jacinto C. Nascimento", "Diogo Araújo"]

import logging
import os
import gc
- import psutil  # Added for memory monitoring
+ import psutil  # For memory monitoring
from pathlib import Path
from datetime import datetime
- from processing.processor import process_directory
-
- # Define batch size for large dataset processing
- BATCH_SIZE = int(os.getenv('BATCH_SIZE', 100))  # Default batch size is 100, can be set via environment variable
+ from processing.processor import process_directory  # Import the main processing function

- # Define constant for mapping file name
+ # Define constants for batch size and mapping file
+ BATCH_SIZE = int(os.getenv('BATCH_SIZE', 100))  # Default batch size of 100, adjustable via environment variable
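+ # For example, launching the script with `BATCH_SIZE=500 python main.py` raises the
+ # batch size to 500; smaller batches generally trade throughput for a lower peak
+ # memory footprint when iterating over large DICOM collections.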
MAPPING_FN = "mapping.csv"

- # Define paths for input/output directories
+ # Define root and folder paths for input/output/logging
ROOT_DIR = Path(__file__).resolve().parents[2]
SOURCE_FOLDER = ROOT_DIR / "dicom-images-breast" / "known" / "raw"
OUTPUT_FOLDER = ROOT_DIR / "dataset-multimodal-breast" / "data" / "curation" / "unexplored"

def setup_logging(logs_folder: Path):
    """
-     Set up logging configuration to log both to a file and the console.
-
+     Set up logging to log both to a file and the console.
+
    Args:
-         logs_folder (Path): The directory where log files will be saved.
+         logs_folder (Path): Directory where log files will be saved.
+
+     Detailed logging configuration that saves logs to both the console and a timestamped file.
    """
-     # Create the logs folder if it doesn't exist
+     # Create logs folder if it doesn't exist
    logs_folder.mkdir(parents=True, exist_ok=True)

-     # Define log file with a timestamp
+     # Create log file with timestamp to differentiate logs for each run
    logs_timestamp = datetime.now().strftime('%Y%m%d%H%M%S')
    log_file = logs_folder / f"log_{logs_timestamp}.log"

    # Set up logging configuration
    logging.basicConfig(
-         level=logging.INFO,
+         level=logging.INFO,  # Log level set to INFO to capture general runtime events
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[
-             logging.FileHandler(log_file),  # Log to file
-             logging.StreamHandler()  # Log to console
+             logging.FileHandler(log_file),  # Log messages to file
+             logging.StreamHandler()  # Log messages to console
        ]
    )
-     logging.info(f"Logging initialized. Log file: {log_file}")

- def monitor_memory():
+     # Log that the logging system has been initialized
+     logging.info(f"Logging initialized. Log file created at: {log_file}")
+
+ def monitor_memory(stage: str):
    """
-     Monitor and log current memory usage.
+     Monitor and log current memory usage at different stages of execution.
+
+     Args:
+         stage (str): Descriptive label of the current stage of the program for logging.
+
+     Logs memory usage in both Resident Set Size (RSS) and Virtual Memory Size (VMS).
    """
-     process = psutil.Process(os.getpid())
-     memory_info = process.memory_info()
-     logging.info(f"Memory usage: RSS={memory_info.rss / (1024 * 1024)} MB, VMS={memory_info.vms / (1024 * 1024)} MB")
+     process = psutil.Process(os.getpid())  # Get the current process ID
+     memory_info = process.memory_info()  # Get memory usage info
+     logging.info(f"[{stage}] Memory usage: RSS={memory_info.rss / (1024 * 1024):.2f} MB, VMS={memory_info.vms / (1024 * 1024):.2f} MB")

def main():
    """
    Main function for running the data processing pipeline.
-     It processes DICOM files from the source folder and maps them to an output folder.
-     Supports batch processing for large datasets.
+
+     Processes DICOM files in batches, monitors memory usage before and after processing,
+     and ensures proper logging and error handling throughout the pipeline.
    """
    logging.info("Starting the data processing pipeline...")
+
+     # Log folder paths
    logging.info(f"Source folder: {SOURCE_FOLDER}")
    logging.info(f"Output folder: {OUTPUT_FOLDER}")
    logging.info(f"Mapping file: {MAPPING_FILE}")
-     logging.info(f"Batch size: {BATCH_SIZE}")
+     logging.info(f"Batch size set to: {BATCH_SIZE}")

-     # Ensure the output folder exists
+     # Ensure the output directory exists
    OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True)

-     # Monitor initial memory usage
-     monitor_memory()
+     # Monitor memory usage at the start of processing
+     monitor_memory("Initial")

-     # Execute the processing pipeline in batches
+     # Execute the processing pipeline in batches, with error handling
    try:
+         # Process the directory using the defined batch size and mapping file
+         logging.info("Starting batch processing...")
        process_directory(SOURCE_FOLDER, OUTPUT_FOLDER, MAPPING_FILE, BATCH_SIZE)
-         logging.info("Data processing pipeline completed successfully.")
+         logging.info("Batch processing completed successfully.")
    except Exception as e:
+         # Log any exceptions encountered during processing
        logging.error(f"An error occurred during the data processing pipeline: {e}")
        raise
    finally:
-         # Explicit garbage collection to free memory
+         # Perform explicit garbage collection to free memory after processing
+         logging.info("Initiating garbage collection...")
        gc.collect()
-         logging.info("Garbage collection completed.")
-         monitor_memory()  # Log memory usage after garbage collection
+         logging.info("Garbage collection complete.")
+
+         # Monitor memory usage after processing and garbage collection
+         monitor_memory("Post-GC")
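+         # Note: gc.collect() only frees unreachable Python objects; the allocator may not
+         # return that memory to the OS right away, so the post-GC RSS reading can stay elevated.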

if __name__ == "__main__":
-     setup_logging(LOGS_FOLDER)  # Initialize logging
-     main()  # Run the main pipeline function
+     # Set up logging before starting the main process
+     setup_logging(LOGS_FOLDER)
+
+     # Run the main function that starts the data processing pipeline
+     main()

# End of file