From ccb21562849fe092d7b6e557265e89f0006a0460 Mon Sep 17 00:00:00 2001
From: Vimal Joseph
Date: Wed, 22 May 2024 23:17:01 +0530
Subject: [PATCH 1/5] Add features to avoid sampling and exceed quota issues

---
 analytics_reporter.py |  76 +++++++++++++++----
 ga_data_fetcher.py    |  24 +++-
 merge_reports.py      |  72 ++++++++++++++++++
 ua_backup.py          | 166 ++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 321 insertions(+), 17 deletions(-)
 create mode 100644 merge_reports.py
 create mode 100644 ua_backup.py

diff --git a/analytics_reporter.py b/analytics_reporter.py
index 91c8b61..8fd82d2 100644
--- a/analytics_reporter.py
+++ b/analytics_reporter.py
@@ -38,16 +38,50 @@ def signal_handler(sig, frame):
     logging.info("You pressed Ctrl+C! Waiting for the current request to complete...")
     interrupted = True
 
-def construct_output_file(property_name, view_id, report_id, report_name):
+def construct_log_file(view_id, report_name, sequence=None, log_type="progress"):
+    """Construct the log file name based on provided parameters."""
+    log_file_name = f"{view_id}_{log_type}.log"
+    if sequence:
+        log_file_name = f"{view_id}_{log_type}.log"
+    return log_file_name
+
+def log_sampling_info(output_dir, view_id, report_name, sampling_info, sequence=None):
+    """Log sampling information to a log file."""
+    sampling_log_file = os.path.join(output_dir, construct_log_file(view_id, report_name, sequence, "sampling"))
+    with open(sampling_log_file, 'a') as log_file:
+        log_file.write(f"Sampling Info for View ID {view_id}, Report: {report_name}:\n")
+        log_file.write(f" Is Sampled: {sampling_info['is_sampled']}\n")
+        log_file.write(f" Samples Read Counts: {sampling_info['samples_read_counts']}\n")
+        log_file.write(f" Sampling Space Sizes: {sampling_info['sampling_space_sizes']}\n")
+        log_file.write("\n")
+
+def construct_output_file(property_name, view_id, report_id, report_name, sequence=None):
     """Construct the output file name based on provided parameters."""
     property_name_clean = clean_name(property_name) if property_name else ""
     report_name_clean = clean_name(report_name)
+
+    output_dir = f"output/{view_id}_{property_name_clean}"
+    os.makedirs(output_dir, exist_ok=True)
+
     if property_name_clean:
-        return f"{property_name_clean}_{view_id}_{report_id}_{report_name_clean}_report.csv"
+        if sequence:
+            return f"{output_dir}/{property_name_clean}_{view_id}_{report_id}_{report_name_clean}_report_{sequence}.csv"
+        else:
+            return f"{output_dir}/{property_name_clean}_{view_id}_{report_id}_{report_name_clean}_report.csv"
     else:
-        return f"{view_id}_{report_id}_{report_name_clean}_report.csv"
+        if sequence:
+            return f"{output_dir}/{view_id}_{report_id}_{report_name_clean}_report_{sequence}.csv"
+        else:
+            return f"{output_dir}/{view_id}_{report_id}_{report_name_clean}_report.csv"
+
+def log_quota_exceeded(view_id):
+    """Log the date and time when quota is exceeded."""
+    quota_log_file = f"quota_exceeded.log"
+    with open(quota_log_file, 'a') as log_file:
+        log_file.write(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
+
 
-def generate_report(report_config, start_date, end_date, api_key, view_id, output_file):
+def generate_report(report_config, start_date, end_date, api_key, view_id, report_name, output_file, sequence=None):
     """Generate report based on provided configuration."""
     global interrupted
     success = False
@@ -59,11 +93,12 @@ def generate_report(report_config, start_date, end_date, api_key, view_id, outpu
     dimensions = report_config['dimensions']
     metrics = report_config['metrics']
 
-    page_size = report_config.get('page_size', 1000) # Default to 1000 if not specified
+    page_size = report_config.get('page_size', 5000) # Default to 5000 if not specified
     sampling_level = report_config.get('sampling_level', 'DEFAULT') # Default to 'DEFAULT' if not specified
     metrics_filter = report_config.get('metrics_filter', False)
 
-    progress_file = f"{view_id}_progress.log"
+    output_dir = os.path.dirname(output_file)
+    progress_file = os.path.join(output_dir, construct_log_file(view_id, report_name, sequence, "progress"))
     progress_data = load_progress(progress_file)
 
     total_records_downloaded = 0
@@ -95,7 +130,17 @@ def generate_report(report_config, start_date, end_date, api_key, view_id, outpu
             break
 
         try:
-            data, next_page_token = get_data(api_key, view_id, dimensions, metrics, start_date, end_date, format_date, page_size, next_page_token, sampling_level, metrics_filter)
+            data, next_page_token, sampling_info, quota_exceeded = get_data(api_key, view_id, dimensions, metrics, start_date, end_date, format_date, page_size, next_page_token, sampling_level, metrics_filter)
+            if quota_exceeded:
+                log_quota_exceeded(view_id)
+                break
+            if sampling_info['is_sampled']:
+                logging.info("Data is sampled.")
+                logging.info(f"Sampling Read Counts: {sampling_info['samples_read_counts']}")
+                logging.info(f"Sample Space Sizes:, {sampling_info['sampling_space_sizes']}")
+                log_sampling_info(output_dir, view_id, report_name, sampling_info, sequence)
+            else:
+                logging.info("Data is not sampled.")
             if not data:
                 logging.info(f"No data available to download for {output_file}")
                 break
@@ -116,6 +161,10 @@ def generate_report(report_config, start_date, end_date, api_key, view_id, outpu
 
             if not next_page_token or interrupted:
                 break
+        except ValueError as e:
+            logging.error(f"ValueError: {e}")
+            logging.error(traceback.format_exc())
+            break  # Exit the loop on any error and save progress
         except Exception as e:
             logging.error(f"An error occurred while fetching data: {e}")
             logging.error(traceback.format_exc())
@@ -130,16 +179,16 @@ def generate_report(report_config, start_date, end_date, api_key, view_id, outpu
     if success and data:
         logging.info(f"Data available in CSV file: {output_file}")
 
-def generate_all_reports(report_configs, start_date, end_date, api_key, view_id, property_name):
+def generate_all_reports(report_configs, start_date, end_date, api_key, view_id, property_name, sequence=None):
     """Generate all reports specified in the configuration."""
     property_name_clean = clean_name(property_name) if property_name else ""
 
     for report_config in report_configs:
         report_name = report_config['name']
-        output_file = construct_output_file(property_name, view_id, report_config['id'], report_name)
+        output_file = construct_output_file(property_name, view_id, report_config['id'], report_name, sequence)
 
         logging.info(f"Generating report for {report_name}")
-        generate_report(report_config, start_date, end_date, api_key, view_id, output_file)
+        generate_report(report_config, start_date, end_date, api_key, view_id, report_name, output_file, sequence)
 
         if interrupted:
             logging.info("Interrupted! Stopping further report generation.")
             break
 
@@ -153,6 +202,7 @@ def main():
     parser.add_argument('-s', '--start', type=str, required=True, help='Start date (YYYY-MM-DD)')
     parser.add_argument('-e', '--end', type=str, required=True, help='End date (YYYY-MM-DD)')
     parser.add_argument('--settings', type=str, help='Path to settings YAML file')
+    parser.add_argument('--sequence', type=str, help='Optional sequence prefix for the output file name')
     args = parser.parse_args()
 
     settings_file = args.settings if args.settings else "settings.yml"
@@ -172,12 +222,12 @@ def main():
             logging.error(f"Report configuration for ID {args.report_id} not found.")
             return
         report_name = report_config['name']
-        output_file = construct_output_file(property_name, view_id, args.report_id, report_name)
+        output_file = construct_output_file(property_name, view_id, args.report_id, report_name, args.sequence)
 
         logging.info(f"Generate report for {report_name}")
-        generate_report(report_config, args.start, args.end, api_key, view_id, output_file)
+        generate_report(report_config, start_date, end_date, api_key, view_id, report_name, output_file, sequence)
     else:
-        generate_all_reports(report_configs['reports'], args.start, args.end, api_key, view_id, property_name)
+        generate_all_reports(report_configs['reports'], args.start, args.end, api_key, view_id, property_name, args.sequence)
 
 if __name__ == "__main__":
     main()

diff --git a/ga_data_fetcher.py b/ga_data_fetcher.py
index 9d0c46a..9c05e67 100644
--- a/ga_data_fetcher.py
+++ b/ga_data_fetcher.py
@@ -10,7 +10,7 @@
 from oauth2client.service_account import ServiceAccountCredentials
 from googleapiclient.errors import HttpError
 
-def get_data(api_key, view_id, dimensions, metrics, start_date, end_date, date_formatter, page_size=1000, next_page_token=None, sample_size='DEFAULT', metric_filter=False):
+def get_data(api_key, view_id, dimensions, metrics, start_date, end_date, date_formatter, page_size=5000, next_page_token=None, sample_size='DEFAULT', metric_filter=False):
     # Initialize service
     credentials = ServiceAccountCredentials.from_json_keyfile_name(api_key)
     service = build('analyticsreporting', 'v4', credentials=credentials)
@@ -45,6 +45,14 @@ def get_data(api_key, view_id, dimensions, metrics, start_date, end_date, date_f
         column_header_entries = report['columnHeader']['dimensions'] + \
             [entry['name'] for entry in report['columnHeader']['metricHeader']['metricHeaderEntries']]
         rows = report.get('data', {}).get('rows', [])
+        samples_read_counts = report.get('data', {}).get('samplesReadCounts', [])
+        sampling_space_sizes = report.get('data', {}).get('samplingSpaceSizes', [])
+        is_sampled = bool(samples_read_counts and sampling_space_sizes)
+        sampling_info = {
+            'is_sampled': is_sampled,
+            'samples_read_counts': samples_read_counts,
+            'sampling_space_sizes': sampling_space_sizes
+        }
 
         for row in rows:
             formatted_row = {}
@@ -61,8 +69,16 @@ def get_data(api_key, view_id, dimensions, metrics, start_date, end_date, date_f
         # Get the next page token, if any
         new_next_page_token = report.get('nextPageToken', None)
 
-        return formatted_data, new_next_page_token
+        return formatted_data, new_next_page_token, sampling_info, False
 
     except HttpError as error:
-        print(f"Error fetching data: {error}")
-        return [], None
+        if error.resp.status == 429:
+            logging.error("Quota Error: Quota exceeded. Please try again later.")
+            return [], None, {'is_sampled': False, 'samples_read_counts': [], 'sampling_space_sizes': []}, True
+        else:
+            logging.error(f"Error fetching data: {error}")
+            return [], None, {'is_sampled': False, 'samples_read_counts': [], 'sampling_space_sizes': []}, False
+    except Exception as e:
+        logging.error(f"An error occurred: {e}")
+        return [], None, {'is_sampled': False, 'samples_read_counts': [], 'sampling_space_sizes': []}, False
+
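The sampling flags in the hunk above come straight from two optional fields of the Reporting API v4 response body. A minimal, self-contained sketch of the same check (the field names follow the v4 report schema; the numbers are invented for illustration):

```python
# Hedged sketch of the is_sampled derivation added to ga_data_fetcher.py.
# The dict stands in for report['data'] from a live response; values are made up.
report_data = {
    "samplesReadCounts": ["499630"],
    "samplingSpaceSizes": ["15328013"],
}

samples_read_counts = report_data.get("samplesReadCounts", [])
sampling_space_sizes = report_data.get("samplingSpaceSizes", [])

# Both fields are omitted entirely when the report is unsampled,
# so their joint presence is the signal.
is_sampled = bool(samples_read_counts and sampling_space_sizes)
print(is_sampled)  # True for the invented values above
```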
diff --git a/merge_reports.py b/merge_reports.py
new file mode 100644
index 0000000..5e33dbe
--- /dev/null
+++ b/merge_reports.py
@@ -0,0 +1,72 @@
+import os
+import pandas as pd
+from glob import glob
+import argparse
+import re
+
+def merge_report_files(input_dir, output_dir):
+    # Create output directory if it doesn't exist
+    os.makedirs(output_dir, exist_ok=True)
+
+    # Dictionary to hold report file information
+    report_files = {}
+
+    # Regex to match report files
+    report_file_pattern = re.compile(r'^(.*)_(\d+)_(\d+)_([a-zA-Z-]+)_report_(\d+)\.csv$')
+
+    # Find all matching files
+    for filepath in glob(os.path.join(input_dir, '*.csv')):
+        filename = os.path.basename(filepath)
+        match = report_file_pattern.match(filename)
+        if match:
+            base_name = match.group(1)
+            view_id = match.group(2)
+            report_id = match.group(3)
+            report_name = match.group(4)
+            sequence = int(match.group(5))
+
+            key = f"{base_name}_{view_id}_{report_id}_{report_name}"
+            if key not in report_files:
+                report_files[key] = []
+            report_files[key].append((sequence, filepath))
+        else:
+            print(f"Skipping file {filename}: Filename does not match expected pattern")
+
+    # Merge files for each report
+    with open(os.path.join(output_dir, 'all_reports.log'), 'w') as log_file:
+        for key, files in report_files.items():
+            files.sort()  # Sort files by sequence number
+            output_file = os.path.join(output_dir, f"{key}_report_full.csv")
+            total_records = 0
+            start_date = None
+            end_date = None
+            num_files_merged = len(files)
+
+            merged_df = pd.DataFrame()
+            for seq, filepath in files:
+                df = pd.read_csv(filepath)
+                if 'ga:date' in df.columns:
+                    if start_date is None:
+                        start_date = df['ga:date'].iloc[0]
+                    end_date = df['ga:date'].iloc[-1]
+                num_records = len(df) - 1 if not merged_df.empty else len(df)  # Subtract header only if it's not the first file
+                total_records += num_records
+                merged_df = pd.concat([merged_df, df], ignore_index=True) if merged_df.empty else pd.concat([merged_df, df.iloc[1:]], ignore_index=True)
+
+            # Write the merged dataframe to CSV
+            merged_df.to_csv(output_file, index=False)
+
+            log_file.write(f"{output_file},{start_date or ''},{end_date or ''},{num_files_merged},{total_records}\n")
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='Merge report files into a single file for each report.')
+    parser.add_argument('input_dir', type=str, help='Path to the input directory containing report files.')
+    parser.add_argument('output_dir', type=str, nargs='?', default='.', help='Path to the output directory where merged files will be stored. Defaults to current directory.')
+
+    args = parser.parse_args()
+
+    input_dir = args.input_dir
+    output_dir = args.output_dir
+
+    merge_report_files(input_dir, output_dir)
+
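merge_reports.py only merges files whose names match the `_report_<sequence>.csv` pattern compiled above; anything else is skipped with a warning. A quick way to sanity-check a filename against that pattern (the example filename below is hypothetical):

```python
import re

# Same pattern as in merge_reports.py; the filename is made up for illustration.
report_file_pattern = re.compile(r'^(.*)_(\d+)_(\d+)_([a-zA-Z-]+)_report_(\d+)\.csv$')

match = report_file_pattern.match("myproperty_12345678_1_users-by-date_report_3.csv")
if match:
    # Groups: base/property name, view ID, report ID, report name, sequence number.
    print(match.groups())  # ('myproperty', '12345678', '1', 'users-by-date', '3')
```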
diff --git a/ua_backup.py b/ua_backup.py
new file mode 100644
index 0000000..e82d223
--- /dev/null
+++ b/ua_backup.py
@@ -0,0 +1,166 @@
+import argparse
+import subprocess
+import datetime
+import signal
+import os
+import logging
+
+# Initialize logger
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+
+interrupted = False
+log_file = "ua-backup-execution.log"
+
+def parse_arguments():
+    """Parse command line arguments."""
+    parser = argparse.ArgumentParser(description='Wrapper script for analytics-reporter.py')
+    parser.add_argument('--start', type=str, required=True, help='Start date (YYYY-MM-DD)')
+    parser.add_argument('--end', type=str, required=True, help='End date (YYYY-MM-DD)')
+    parser.add_argument('--settings', type=str, help='Path to settings YAML file')
+    parser.add_argument('--report_id', type=int, help='ID of the report to generate')
+    parser.add_argument('--report_level', type=str, choices=['day', 'week', 'month', 'year'], required=True, help='Report level to split date range')
+    args = parser.parse_args()
+    return args
+
+def date_range(start_date, end_date, delta):
+    """Generate a range of dates."""
+    current_date = start_date
+    while current_date <= end_date:
+        yield current_date
+        current_date += delta
+
+def split_date_range(start, end, report_level):
+    """Split the date range based on the report level."""
+    start_date = datetime.datetime.strptime(start, "%Y-%m-%d")
+    end_date = datetime.datetime.strptime(end, "%Y-%m-%d")
+
+    periods = []
+    if report_level == 'day':
+        delta = datetime.timedelta(days=1)
+        for current_start in date_range(start_date, end_date, delta):
+            current_end = current_start + delta - datetime.timedelta(days=1)
+            periods.append((current_start.strftime("%Y-%m-%d"), current_end.strftime("%Y-%m-%d")))
+    elif report_level == 'week':
+        delta = datetime.timedelta(weeks=1)
+        for current_start in date_range(start_date, end_date, delta):
+            current_end = current_start + delta - datetime.timedelta(days=1)
+            if current_end > end_date:
+                current_end = end_date
+            periods.append((current_start.strftime("%Y-%m-%d"), current_end.strftime("%Y-%m-%d")))
+    elif report_level == 'month':
+        def add_months(dt, months):
+            month = dt.month - 1 + months
+            year = dt.year + month // 12
+            month = month % 12 + 1
+            day = min(dt.day, [31, 29 if year % 4 == 0 and not year % 100 == 0 or year % 400 == 0 else 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31][month - 1])
+            return dt.replace(year=year, month=month, day=day)
+        current_start = start_date
+        while current_start <= end_date:
+            current_end = add_months(current_start, 1) - datetime.timedelta(days=1)
+            if current_end > end_date:
+                current_end = end_date
+            periods.append((current_start.strftime("%Y-%m-%d"), current_end.strftime("%Y-%m-%d")))
+            current_start = current_end + datetime.timedelta(days=1)
+    elif report_level == 'year':
+        delta = datetime.timedelta(days=365)  # Approximation
+        for current_start in date_range(start_date, end_date, delta):
+            current_end = current_start + delta - datetime.timedelta(days=1)
+            if current_end > end_date:
+                current_end = end_date
+            periods.append((current_start.strftime("%Y-%m-%d"), current_end.strftime("%Y-%m-%d")))
+
+    return periods
+
+def log_execution(start_date, end_date, sequence):
+    """Log the execution details to the log file."""
+    with open(log_file, 'a') as f:
+        f.write(f"{start_date},{end_date},{sequence}\n")
+
+def check_quota_exceeded():
+    """Check if quota exceeded file exists and is less than 24 hours old."""
+    quota_log_file = f"quota_exceeded.log"
+    if os.path.exists(quota_log_file):
+        with open(quota_log_file, 'r') as log_file:
+            lines = log_file.readlines()
+            if lines:
+                last_line = lines[-1].strip()
+                last_exceed_time = datetime.strptime(last_line, '%Y-%m-%d %H:%M:%S')
+                if datetime.now() - last_exceed_time < timedelta(hours=24):
+                    logging.info("Quota was exceeded less than 24 hours ago. Exiting.")
+                    return True
+    return False
+
+def read_last_execution():
+    """Read the last execution details from the log file."""
+    if not os.path.exists(log_file):
+        return None, None, None
+    with open(log_file, 'r') as f:
+        lines = f.readlines()
+        if not lines:
+            return None, None, None
+        last_line = lines[-1]
+        last_start, last_end, last_sequence = last_line.strip().split(',')
+        return last_start, last_end, int(last_sequence)
+
+def run_analytics_reporter(start_date, end_date, settings, report_id, sequence):
+    """Run the analytics_reporter.py script with the provided arguments."""
+    global interrupted
+    logger.info(f"Running analytics_reporter.py for period: {start_date} to {end_date}")
+    cmd = [
+        'python', 'analytics_reporter.py',
+        '--start', start_date,
+        '--end', end_date,
+        '--sequence', str(sequence)
+    ]
+    if settings is not None:
+        cmd.extend(['--settings', settings])
+    if report_id is not None:
+        cmd.extend(['--report_id', str(report_id)])
+
+    logger.info(f"Command: {cmd}")  # Debugging statement
+    process = subprocess.Popen(cmd)
+    try:
+        process.wait()
+        log_execution(start_date, end_date, sequence)  # Log the execution after successful run
+    except KeyboardInterrupt:
+        interrupted = True
+        logger.info("Interrupt received. Waiting for the current report to complete...")
+
+def signal_handler(sig, frame):
+    """Signal handler for SIGINT."""
+    global interrupted
+    interrupted = True
+    logger.info("Interrupt received. Current execution will finish before exiting.")
+
+def main():
+    """Main function to parse arguments, split date range, and call analytics-reporter.py for each period."""
+    global interrupted
+    args = parse_arguments()
+
+    signal.signal(signal.SIGINT, signal_handler)
+
+    last_start, last_end, last_sequence = read_last_execution()
+
+    if last_start and last_end and last_sequence:
+        # Resume from the last logged entry
+        periods = split_date_range(last_start, args.end, args.report_level)
+        start_sequence = last_sequence
+    else:
+        # Start from the beginning
+        periods = split_date_range(args.start, args.end, args.report_level)
+        start_sequence = 1
+
+    for sequence, (start_date, end_date) in enumerate(periods, start=start_sequence):
+        if interrupted:
+            break
+        if check_quota_exceeded():
+            logging.info("Quota was exceeded recently. Exiting.")
+            break
+        run_analytics_reporter(start_date, end_date, args.settings, args.report_id, sequence)
+
+    if interrupted:
+        logger.info("Execution was interrupted. Exiting after completing the current report.")
+
+if __name__ == "__main__":
+    main()
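The resume and quota checks above depend on how `split_date_range` slices the requested window. A minimal check of the month-level behaviour, assuming `ua_backup.py` is importable from the working directory (the dates are arbitrary examples):

```python
# Hedged sketch: month-level splitting walks calendar months from the start date
# and clamps the final period to the requested end date.
from ua_backup import split_date_range

periods = split_date_range("2023-01-15", "2023-03-10", "month")
print(periods)
# Expected: [('2023-01-15', '2023-02-14'), ('2023-02-15', '2023-03-10')]
```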
From cac1e98edde77067d468b1df7911d0d45fc5989b Mon Sep 17 00:00:00 2001
From: Vimal Joseph
Date: Thu, 23 May 2024 07:13:25 +0530
Subject: [PATCH 2/5] Fix dependency issue

---
 analytics_reporter.py | 1 +
 ga_data_fetcher.py    | 1 +
 2 files changed, 2 insertions(+)

diff --git a/analytics_reporter.py b/analytics_reporter.py
index 8fd82d2..11b222c 100644
--- a/analytics_reporter.py
+++ b/analytics_reporter.py
@@ -13,6 +13,7 @@
 import traceback
 import json
 import logging
+from datetime import datetime
 
 from ga_data_fetcher import get_data
 from utils import format_date, write_to_csv, append_to_csv, clear_csv_file, clean_name, load_progress, save_progress
diff --git a/ga_data_fetcher.py b/ga_data_fetcher.py
index 9c05e67..4d79d7c 100644
--- a/ga_data_fetcher.py
+++ b/ga_data_fetcher.py
@@ -9,6 +9,7 @@
 from googleapiclient.discovery import build
 from oauth2client.service_account import ServiceAccountCredentials
 from googleapiclient.errors import HttpError
+import logging
 
 def get_data(api_key, view_id, dimensions, metrics, start_date, end_date, date_formatter, page_size=5000, next_page_token=None, sample_size='DEFAULT', metric_filter=False):
     # Initialize service

From 21846f8df782360edaacd8328dbe1a26862a3b17 Mon Sep 17 00:00:00 2001
From: Vimal Joseph
Date: Thu, 23 May 2024 07:18:29 +0530
Subject: [PATCH 3/5] Fixed module import error

---
 ua_backup.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ua_backup.py b/ua_backup.py
index e82d223..ab52885 100644
--- a/ua_backup.py
+++ b/ua_backup.py
@@ -85,8 +85,8 @@ def check_quota_exceeded():
             lines = log_file.readlines()
             if lines:
                 last_line = lines[-1].strip()
-                last_exceed_time = datetime.strptime(last_line, '%Y-%m-%d %H:%M:%S')
-                if datetime.now() - last_exceed_time < timedelta(hours=24):
+                last_exceed_time = datetime.datetime.strptime(last_line, '%Y-%m-%d %H:%M:%S')
+                if datetime.datetime.now() - last_exceed_time < datetime.timedelta(hours=24):
                     logging.info("Quota was exceeded less than 24 hours ago. Exiting.")
                     return True
     return False
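The one-line change in PATCH 3/5 is needed because `ua_backup.py` does `import datetime` (the module), not `from datetime import datetime` (the class), so the parser and `timedelta` have to be reached through the module namespace. A small illustration with an arbitrary example timestamp:

```python
import datetime

# With `import datetime`, calling datetime.strptime(...) raises AttributeError;
# the parser lives on the class, i.e. datetime.datetime.strptime, as the fix uses.
last_exceed_time = datetime.datetime.strptime("2024-05-23 07:00:00", "%Y-%m-%d %H:%M:%S")
recent = datetime.datetime.now() - last_exceed_time < datetime.timedelta(hours=24)
print(recent)
```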
Exiting.") return True return False From 4ea89b1b249fdc0e97a4bedb3ddb101f00b03043 Mon Sep 17 00:00:00 2001 From: Vimal Joseph Date: Tue, 28 May 2024 12:05:02 +0530 Subject: [PATCH 4/5] Fix the merge csv bug --- analytics_reporter.py | 2 +- merge_reports.py | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/analytics_reporter.py b/analytics_reporter.py index 11b222c..7e4b2ce 100644 --- a/analytics_reporter.py +++ b/analytics_reporter.py @@ -226,7 +226,7 @@ def main(): output_file = construct_output_file(property_name, view_id, args.report_id, report_name, args.sequence) logging.info(f"Generate report for {report_name}") - generate_report(report_config, start_date, end_date, api_key, view_id, report_name, output_file, sequence) + generate_report(report_config, args.start, args.end, api_key, view_id, report_name, output_file, args.sequence) else: generate_all_reports(report_configs['reports'], args.start, args.end, api_key, view_id, property_name, args.sequence) diff --git a/merge_reports.py b/merge_reports.py index 5e33dbe..1801051 100644 --- a/merge_reports.py +++ b/merge_reports.py @@ -24,7 +24,7 @@ def merge_report_files(input_dir, output_dir): report_id = match.group(3) report_name = match.group(4) sequence = int(match.group(5)) - + key = f"{base_name}_{view_id}_{report_id}_{report_name}" if key not in report_files: report_files[key] = [] @@ -49,9 +49,9 @@ def merge_report_files(input_dir, output_dir): if start_date is None: start_date = df['ga:date'].iloc[0] end_date = df['ga:date'].iloc[-1] - num_records = len(df) - 1 if not merged_df.empty else len(df) # Subtract header only if it's not the first file + num_records = len(df) # Count all rows, no header adjustment total_records += num_records - merged_df = pd.concat([merged_df, df], ignore_index=True) if merged_df.empty else pd.concat([merged_df, df.iloc[1:]], ignore_index=True) + merged_df = pd.concat([merged_df, df], ignore_index=True) # Write the merged dataframe to CSV merged_df.to_csv(output_file, index=False) @@ -69,4 +69,3 @@ def merge_report_files(input_dir, output_dir): output_dir = args.output_dir merge_report_files(input_dir, output_dir) - From f26c19bbe5da00c31f45e692ff6e441f885ac433 Mon Sep 17 00:00:00 2001 From: Vimal Joseph Date: Tue, 28 May 2024 14:44:42 +0530 Subject: [PATCH 5/5] Update Readme to document the wrapper scripts to avoid sampling --- README.md | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index e40523b..c018af9 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,7 @@ This is a Free Software licenced under GNU GPL v2.0 or above. Please see [What i - `oauth2client` - `pyyaml` - `argparse` +- `pandas` ## Setup and Installation @@ -36,7 +37,7 @@ This is a Free Software licenced under GNU GPL v2.0 or above. Please see [What i ```sh python -m venv venv source venv/bin/activate - pip install google-api-python-client oauth2client pyyaml argparse + pip install google-api-python-client oauth2client pyyaml argparse pandas ``` 3. **Service Account and API Key:** @@ -114,6 +115,26 @@ python3 analytics_reporter.py --report_id 1 --start 2023-01-01 --end 2023-01-31 The script will automatically resume downloading from the last saved progress. +### Avoid Sampling + +The Google Analytics Reporting API may return a sample of sessions if the date range is very large or the number of records in a query is very large. 
+The Google Analytics Reporting API may return a sample of sessions if the date range is very large or a query matches a very large number of records. To address this, we have included a wrapper script `ua_backup.py`, which accepts the same arguments as `analytics_reporter.py` and splits the date range into smaller chunks based on the value of the `--report_level` argument. The available options are 'day', 'week', 'month', and 'year'.
+
+**Example:**
+
+```sh
+python3 ua_backup.py --report_id 1 --start 2020-01-01 --end 2023-01-31 --report_level day
+```
+
+This will run the query for each period (here, each day) and store the results as separate CSV files in the output folder. The script `merge_reports.py` can then be used to merge the individual CSV files into a single CSV file per report.
+
+```sh
+python3 merge_reports.py output/123423_ua-property full_report
+```
+
+You will get the merged CSV report in the `full_report` folder.
+
+**Note:** The wrapper records each completed run in `ua-backup-execution.log` so that execution can resume from the last entry after an error. It also uses `quota_exceeded.log` to track when the API quota was exceeded, and the `_progress.log` files to track individual reports. To start a fresh run from the beginning, remove these log files.
+
 ### Debugging
 
 Try removing the _progress.log file and the csv files generated to restart the download. Check the settings.yml, reports_config.yml files and make sure that the values are correct. Also refer the settings.yml.default and reports_config.yml.example.
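After merging, it is worth confirming that the stitched file actually spans the requested window. A short pandas check, using a hypothetical merged file name (the real name depends on your property, view ID, report ID and report name):

```python
import pandas as pd

# Hypothetical path; substitute whatever merge_reports.py wrote to the output folder.
df = pd.read_csv("full_report/myproperty_12345678_1_users-by-date_report_full.csv")

print(len(df))  # total rows across all merged chunks
if "ga:date" in df.columns:
    # Earliest and latest dates present in the merged report.
    print(df["ga:date"].min(), df["ga:date"].max())
```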