From ccb21562849fe092d7b6e557265e89f0006a0460 Mon Sep 17 00:00:00 2001
From: Vimal Joseph
Date: Wed, 22 May 2024 23:17:01 +0530
Subject: [PATCH 1/5] Add features to avoid sampling and exceed quota issues

---
 analytics_reporter.py |  76 +++++++++++++++----
 ga_data_fetcher.py    |  24 +++-
 merge_reports.py      |  72 ++++++++++++++++++
 ua_backup.py          | 166 ++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 321 insertions(+), 17 deletions(-)
 create mode 100644 merge_reports.py
 create mode 100644 ua_backup.py

diff --git a/analytics_reporter.py b/analytics_reporter.py
index 91c8b61..8fd82d2 100644
--- a/analytics_reporter.py
+++ b/analytics_reporter.py
@@ -38,16 +38,50 @@ def signal_handler(sig, frame):
     logging.info("You pressed Ctrl+C! Waiting for the current request to complete...")
     interrupted = True
 
-def construct_output_file(property_name, view_id, report_id, report_name):
+def construct_log_file(view_id, report_name, sequence=None, log_type="progress"):
+    """Construct the log file name based on provided parameters."""
+    log_file_name = f"{view_id}_{log_type}.log"
+    if sequence:
+        log_file_name = f"{view_id}_{log_type}.log"
+    return log_file_name
+
+def log_sampling_info(output_dir, view_id, report_name, sampling_info, sequence=None):
+    """Log sampling information to a log file."""
+    sampling_log_file = os.path.join(output_dir, construct_log_file(view_id, report_name, sequence, "sampling"))
+    with open(sampling_log_file, 'a') as log_file:
+        log_file.write(f"Sampling Info for View ID {view_id}, Report: {report_name}:\n")
+        log_file.write(f" Is Sampled: {sampling_info['is_sampled']}\n")
+        log_file.write(f" Samples Read Counts: {sampling_info['samples_read_counts']}\n")
+        log_file.write(f" Sampling Space Sizes: {sampling_info['sampling_space_sizes']}\n")
+        log_file.write("\n")
+
+def construct_output_file(property_name, view_id, report_id, report_name, sequence=None):
     """Construct the output file name based on provided parameters."""
     property_name_clean = clean_name(property_name) if property_name else ""
     report_name_clean = clean_name(report_name)
+
+    output_dir = f"output/{view_id}_{property_name_clean}"
+    os.makedirs(output_dir, exist_ok=True)
+
     if property_name_clean:
-        return f"{property_name_clean}_{view_id}_{report_id}_{report_name_clean}_report.csv"
+        if sequence:
+            return f"{output_dir}/{property_name_clean}_{view_id}_{report_id}_{report_name_clean}_report_{sequence}.csv"
+        else:
+            return f"{output_dir}/{property_name_clean}_{view_id}_{report_id}_{report_name_clean}_report.csv"
     else:
-        return f"{view_id}_{report_id}_{report_name_clean}_report.csv"
+        if sequence:
+            return f"{output_dir}/{view_id}_{report_id}_{report_name_clean}_report_{sequence}.csv"
+        else:
+            return f"{output_dir}/{view_id}_{report_id}_{report_name_clean}_report.csv"
+
+def log_quota_exceeded(view_id):
+    """Log the date and time when quota is exceeded."""
+    quota_log_file = f"quota_exceeded.log"
+    with open(quota_log_file, 'a') as log_file:
+        log_file.write(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
+
 
-def generate_report(report_config, start_date, end_date, api_key, view_id, output_file):
+def generate_report(report_config, start_date, end_date, api_key, view_id, report_name, output_file, sequence=None):
     """Generate report based on provided configuration."""
     global interrupted
     success = False
@@ -59,11 +93,12 @@ def generate_report(report_config, start_date, end_date, api_key, view_id, outpu
     dimensions = report_config['dimensions']
     metrics = report_config['metrics']
 
-    page_size = report_config.get('page_size', 1000) # Default to 1000 if not specified
+    page_size = report_config.get('page_size', 5000) # Default to 5000 if not specified
     sampling_level = report_config.get('sampling_level', 'DEFAULT') # Default to 'DEFAULT' if not specified
     metrics_filter = report_config.get('metrics_filter', False)
 
-    progress_file = f"{view_id}_progress.log"
+    output_dir = os.path.dirname(output_file)
+    progress_file = os.path.join(output_dir, construct_log_file(view_id, report_name, sequence, "progress"))
     progress_data = load_progress(progress_file)
 
     total_records_downloaded = 0
@@ -95,7 +130,17 @@ def generate_report(report_config, start_date, end_date, api_key, view_id, outpu
             break
 
         try:
-            data, next_page_token = get_data(api_key, view_id, dimensions, metrics, start_date, end_date, format_date, page_size, next_page_token, sampling_level, metrics_filter)
+            data, next_page_token, sampling_info, quota_exceeded = get_data(api_key, view_id, dimensions, metrics, start_date, end_date, format_date, page_size, next_page_token, sampling_level, metrics_filter)
+            if quota_exceeded:
+                log_quota_exceeded(view_id)
+                break
+            if sampling_info['is_sampled']:
+                logging.info("Data is sampled.")
+                logging.info(f"Sampling Read Counts: {sampling_info['samples_read_counts']}")
+                logging.info(f"Sample Space Sizes:, {sampling_info['sampling_space_sizes']}")
+                log_sampling_info(output_dir, view_id, report_name, sampling_info, sequence)
+            else:
+                logging.info("Data is not sampled.")
             if not data:
                 logging.info(f"No data available to download for {output_file}")
                 break
@@ -116,6 +161,10 @@ def generate_report(report_config, start_date, end_date, api_key, view_id, outpu
 
             if not next_page_token or interrupted:
                 break
+        except ValueError as e:
+            logging.error(f"ValueError: {e}")
+            logging.error(traceback.format_exc())
+            break  # Exit the loop on any error and save progress
         except Exception as e:
             logging.error(f"An error occurred while fetching data: {e}")
             logging.error(traceback.format_exc())
@@ -130,16 +179,16 @@ def generate_report(report_config, start_date, end_date, api_key, view_id, outpu
     if success and data:
         logging.info(f"Data available in CSV file: {output_file}")
 
-def generate_all_reports(report_configs, start_date, end_date, api_key, view_id, property_name):
+def generate_all_reports(report_configs, start_date, end_date, api_key, view_id, property_name, sequence=None):
     """Generate all reports specified in the configuration."""
     property_name_clean = clean_name(property_name) if property_name else ""
 
     for report_config in report_configs:
         report_name = report_config['name']
-        output_file = construct_output_file(property_name, view_id, report_config['id'], report_name)
+        output_file = construct_output_file(property_name, view_id, report_config['id'], report_name, sequence)
 
         logging.info(f"Generating report for {report_name}")
-        generate_report(report_config, start_date, end_date, api_key, view_id, output_file)
+        generate_report(report_config, start_date, end_date, api_key, view_id, report_name, output_file, sequence)
 
         if interrupted:
             logging.info("Interrupted! Stopping further report generation.")
             break
 
@@ -153,6 +202,7 @@ def main():
     parser.add_argument('-s', '--start', type=str, required=True, help='Start date (YYYY-MM-DD)')
     parser.add_argument('-e', '--end', type=str, required=True, help='End date (YYYY-MM-DD)')
     parser.add_argument('--settings', type=str, help='Path to settings YAML file')
+    parser.add_argument('--sequence', type=str, help='Optional sequence prefix for the output file name')
     args = parser.parse_args()
 
     settings_file = args.settings if args.settings else "settings.yml"
@@ -172,12 +222,12 @@ def main():
             logging.error(f"Report configuration for ID {args.report_id} not found.")
             return
         report_name = report_config['name']
-        output_file = construct_output_file(property_name, view_id, args.report_id, report_name)
+        output_file = construct_output_file(property_name, view_id, args.report_id, report_name, args.sequence)
 
         logging.info(f"Generate report for {report_name}")
-        generate_report(report_config, args.start, args.end, api_key, view_id, output_file)
+        generate_report(report_config, start_date, end_date, api_key, view_id, report_name, output_file, sequence)
     else:
-        generate_all_reports(report_configs['reports'], args.start, args.end, api_key, view_id, property_name)
+        generate_all_reports(report_configs['reports'], args.start, args.end, api_key, view_id, property_name, args.sequence)
 
 if __name__ == "__main__":
     main()

diff --git a/ga_data_fetcher.py b/ga_data_fetcher.py
index 9d0c46a..9c05e67 100644
--- a/ga_data_fetcher.py
+++ b/ga_data_fetcher.py
@@ -10,7 +10,7 @@
 from oauth2client.service_account import ServiceAccountCredentials
 from googleapiclient.errors import HttpError
 
-def get_data(api_key, view_id, dimensions, metrics, start_date, end_date, date_formatter, page_size=1000, next_page_token=None, sample_size='DEFAULT', metric_filter=False):
+def get_data(api_key, view_id, dimensions, metrics, start_date, end_date, date_formatter, page_size=5000, next_page_token=None, sample_size='DEFAULT', metric_filter=False):
     # Initialize service
     credentials = ServiceAccountCredentials.from_json_keyfile_name(api_key)
     service = build('analyticsreporting', 'v4', credentials=credentials)
@@ -45,6 +45,14 @@ def get_data(api_key, view_id, dimensions, metrics, start_date, end_date, date_f
         column_header_entries = report['columnHeader']['dimensions'] + \
             [entry['name'] for entry in report['columnHeader']['metricHeader']['metricHeaderEntries']]
         rows = report.get('data', {}).get('rows', [])
+        samples_read_counts = report.get('data', {}).get('samplesReadCounts', [])
+        sampling_space_sizes = report.get('data', {}).get('samplingSpaceSizes', [])
+        is_sampled = bool(samples_read_counts and sampling_space_sizes)
+        sampling_info = {
+            'is_sampled': is_sampled,
+            'samples_read_counts': samples_read_counts,
+            'sampling_space_sizes': sampling_space_sizes
+        }
 
         for row in rows:
             formatted_row = {}
@@ -61,8 +69,16 @@ def get_data(api_key, view_id, dimensions, metrics, start_date, end_date, date_f
         # Get the next page token, if any
         new_next_page_token = report.get('nextPageToken', None)
 
-        return formatted_data, new_next_page_token
+        return formatted_data, new_next_page_token, sampling_info, False
 
     except HttpError as error:
-        print(f"Error fetching data: {error}")
-        return [], None
+        if error.resp.status == 429:
+            logging.error("Quota Error: Quota exceeded. Please try again later.")
+            return [], None, {'is_sampled': False, 'samples_read_counts': [], 'sampling_space_sizes': []}, True
+        else:
+            logging.error(f"Error fetching data: {error}")
+            return [], None, {'is_sampled': False, 'samples_read_counts': [], 'sampling_space_sizes': []}, False
+    except Exception as e:
+        logging.error(f"An error occurred: {e}")
+        return [], None, {'is_sampled': False, 'samples_read_counts': [], 'sampling_space_sizes': []}, False
+
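The sampling flags in the hunk above come straight from two optional fields of the Reporting API v4 response body. A minimal, self-contained sketch of the same check (the field names follow the v4 report schema; the numbers are invented for illustration):

```python
# Hedged sketch of the is_sampled derivation added to ga_data_fetcher.py.
# The dict stands in for report['data'] from a live response; values are made up.
report_data = {
    "samplesReadCounts": ["499630"],
    "samplingSpaceSizes": ["15328013"],
}

samples_read_counts = report_data.get("samplesReadCounts", [])
sampling_space_sizes = report_data.get("samplingSpaceSizes", [])

# Both fields are omitted entirely when the report is unsampled,
# so their joint presence is the signal.
is_sampled = bool(samples_read_counts and sampling_space_sizes)
print(is_sampled)  # True for the invented values above
```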
diff --git a/merge_reports.py b/merge_reports.py
new file mode 100644
index 0000000..5e33dbe
--- /dev/null
+++ b/merge_reports.py
@@ -0,0 +1,72 @@
+import os
+import pandas as pd
+from glob import glob
+import argparse
+import re
+
+def merge_report_files(input_dir, output_dir):
+    # Create output directory if it doesn't exist
+    os.makedirs(output_dir, exist_ok=True)
+
+    # Dictionary to hold report file information
+    report_files = {}
+
+    # Regex to match report files
+    report_file_pattern = re.compile(r'^(.*)_(\d+)_(\d+)_([a-zA-Z-]+)_report_(\d+)\.csv$')
+
+    # Find all matching files
+    for filepath in glob(os.path.join(input_dir, '*.csv')):
+        filename = os.path.basename(filepath)
+        match = report_file_pattern.match(filename)
+        if match:
+            base_name = match.group(1)
+            view_id = match.group(2)
+            report_id = match.group(3)
+            report_name = match.group(4)
+            sequence = int(match.group(5))
+
+            key = f"{base_name}_{view_id}_{report_id}_{report_name}"
+            if key not in report_files:
+                report_files[key] = []
+            report_files[key].append((sequence, filepath))
+        else:
+            print(f"Skipping file {filename}: Filename does not match expected pattern")
+
+    # Merge files for each report
+    with open(os.path.join(output_dir, 'all_reports.log'), 'w') as log_file:
+        for key, files in report_files.items():
+            files.sort()  # Sort files by sequence number
+            output_file = os.path.join(output_dir, f"{key}_report_full.csv")
+            total_records = 0
+            start_date = None
+            end_date = None
+            num_files_merged = len(files)
+
+            merged_df = pd.DataFrame()
+            for seq, filepath in files:
+                df = pd.read_csv(filepath)
+                if 'ga:date' in df.columns:
+                    if start_date is None:
+                        start_date = df['ga:date'].iloc[0]
+                    end_date = df['ga:date'].iloc[-1]
+                num_records = len(df) - 1 if not merged_df.empty else len(df)  # Subtract header only if it's not the first file
+                total_records += num_records
+                merged_df = pd.concat([merged_df, df], ignore_index=True) if merged_df.empty else pd.concat([merged_df, df.iloc[1:]], ignore_index=True)
+
+            # Write the merged dataframe to CSV
+            merged_df.to_csv(output_file, index=False)
+
+            log_file.write(f"{output_file},{start_date or ''},{end_date or ''},{num_files_merged},{total_records}\n")
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='Merge report files into a single file for each report.')
+    parser.add_argument('input_dir', type=str, help='Path to the input directory containing report files.')
+    parser.add_argument('output_dir', type=str, nargs='?', default='.', help='Path to the output directory where merged files will be stored. Defaults to current directory.')
+
+    args = parser.parse_args()
+
+    input_dir = args.input_dir
+    output_dir = args.output_dir
+
+    merge_report_files(input_dir, output_dir)
+
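merge_reports.py only merges files whose names match the `_report_<sequence>.csv` pattern compiled above; anything else is skipped with a warning. A quick way to sanity-check a filename against that pattern (the example filename below is hypothetical):

```python
import re

# Same pattern as in merge_reports.py; the filename is made up for illustration.
report_file_pattern = re.compile(r'^(.*)_(\d+)_(\d+)_([a-zA-Z-]+)_report_(\d+)\.csv$')

match = report_file_pattern.match("myproperty_12345678_1_users-by-date_report_3.csv")
if match:
    # Groups: base/property name, view ID, report ID, report name, sequence number.
    print(match.groups())  # ('myproperty', '12345678', '1', 'users-by-date', '3')
```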
diff --git a/ua_backup.py b/ua_backup.py
new file mode 100644
index 0000000..e82d223
--- /dev/null
+++ b/ua_backup.py
@@ -0,0 +1,166 @@
+import argparse
+import subprocess
+import datetime
+import signal
+import os
+import logging
+
+# Initialize logger
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+
+interrupted = False
+log_file = "ua-backup-execution.log"
+
+def parse_arguments():
+    """Parse command line arguments."""
+    parser = argparse.ArgumentParser(description='Wrapper script for analytics-reporter.py')
+    parser.add_argument('--start', type=str, required=True, help='Start date (YYYY-MM-DD)')
+    parser.add_argument('--end', type=str, required=True, help='End date (YYYY-MM-DD)')
+    parser.add_argument('--settings', type=str, help='Path to settings YAML file')
+    parser.add_argument('--report_id', type=int, help='ID of the report to generate')
+    parser.add_argument('--report_level', type=str, choices=['day', 'week', 'month', 'year'], required=True, help='Report level to split date range')
+    args = parser.parse_args()
+    return args
+
+def date_range(start_date, end_date, delta):
+    """Generate a range of dates."""
+    current_date = start_date
+    while current_date <= end_date:
+        yield current_date
+        current_date += delta
+
+def split_date_range(start, end, report_level):
+    """Split the date range based on the report level."""
+    start_date = datetime.datetime.strptime(start, "%Y-%m-%d")
+    end_date = datetime.datetime.strptime(end, "%Y-%m-%d")
+
+    periods = []
+    if report_level == 'day':
+        delta = datetime.timedelta(days=1)
+        for current_start in date_range(start_date, end_date, delta):
+            current_end = current_start + delta - datetime.timedelta(days=1)
+            periods.append((current_start.strftime("%Y-%m-%d"), current_end.strftime("%Y-%m-%d")))
+    elif report_level == 'week':
+        delta = datetime.timedelta(weeks=1)
+        for current_start in date_range(start_date, end_date, delta):
+            current_end = current_start + delta - datetime.timedelta(days=1)
+            if current_end > end_date:
+                current_end = end_date
+            periods.append((current_start.strftime("%Y-%m-%d"), current_end.strftime("%Y-%m-%d")))
+    elif report_level == 'month':
+        def add_months(dt, months):
+            month = dt.month - 1 + months
+            year = dt.year + month // 12
+            month = month % 12 + 1
+            day = min(dt.day, [31, 29 if year % 4 == 0 and not year % 100 == 0 or year % 400 == 0 else 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31][month - 1])
+            return dt.replace(year=year, month=month, day=day)
+        current_start = start_date
+        while current_start <= end_date:
+            current_end = add_months(current_start, 1) - datetime.timedelta(days=1)
+            if current_end > end_date:
+                current_end = end_date
+            periods.append((current_start.strftime("%Y-%m-%d"), current_end.strftime("%Y-%m-%d")))
+            current_start = current_end + datetime.timedelta(days=1)
+    elif report_level == 'year':
+        delta = datetime.timedelta(days=365)  # Approximation
+        for current_start in date_range(start_date, end_date, delta):
+            current_end = current_start + delta - datetime.timedelta(days=1)
+            if current_end > end_date:
+                current_end = end_date
+            periods.append((current_start.strftime("%Y-%m-%d"), current_end.strftime("%Y-%m-%d")))
+
+    return periods
+
+def log_execution(start_date, end_date, sequence):
+    """Log the execution details to the log file."""
+    with open(log_file, 'a') as f:
+        f.write(f"{start_date},{end_date},{sequence}\n")
+
+def check_quota_exceeded():
+    """Check if quota exceeded file exists and is less than 24 hours old."""
+    quota_log_file = f"quota_exceeded.log"
+    if os.path.exists(quota_log_file):
+        with open(quota_log_file, 'r') as log_file:
+            lines = log_file.readlines()
+            if lines:
+                last_line = lines[-1].strip()
+                last_exceed_time = datetime.strptime(last_line, '%Y-%m-%d %H:%M:%S')
+                if datetime.now() - last_exceed_time < timedelta(hours=24):
+                    logging.info("Quota was exceeded less than 24 hours ago. Exiting.")
+                    return True
+    return False
+
+def read_last_execution():
+    """Read the last execution details from the log file."""
+    if not os.path.exists(log_file):
+        return None, None, None
+    with open(log_file, 'r') as f:
+        lines = f.readlines()
+        if not lines:
+            return None, None, None
+        last_line = lines[-1]
+        last_start, last_end, last_sequence = last_line.strip().split(',')
+        return last_start, last_end, int(last_sequence)
+
+def run_analytics_reporter(start_date, end_date, settings, report_id, sequence):
+    """Run the analytics_reporter.py script with the provided arguments."""
+    global interrupted
+    logger.info(f"Running analytics_reporter.py for period: {start_date} to {end_date}")
+    cmd = [
+        'python', 'analytics_reporter.py',
+        '--start', start_date,
+        '--end', end_date,
+        '--sequence', str(sequence)
+    ]
+    if settings is not None:
+        cmd.extend(['--settings', settings])
+    if report_id is not None:
+        cmd.extend(['--report_id', str(report_id)])
+
+    logger.info(f"Command: {cmd}")  # Debugging statement
+    process = subprocess.Popen(cmd)
+    try:
+        process.wait()
+        log_execution(start_date, end_date, sequence)  # Log the execution after successful run
+    except KeyboardInterrupt:
+        interrupted = True
+        logger.info("Interrupt received. Waiting for the current report to complete...")
+
+def signal_handler(sig, frame):
+    """Signal handler for SIGINT."""
+    global interrupted
+    interrupted = True
+    logger.info("Interrupt received. Current execution will finish before exiting.")
+
+def main():
+    """Main function to parse arguments, split date range, and call analytics-reporter.py for each period."""
+    global interrupted
+    args = parse_arguments()
+
+    signal.signal(signal.SIGINT, signal_handler)
+
+    last_start, last_end, last_sequence = read_last_execution()
+
+    if last_start and last_end and last_sequence:
+        # Resume from the last logged entry
+        periods = split_date_range(last_start, args.end, args.report_level)
+        start_sequence = last_sequence
+    else:
+        # Start from the beginning
+        periods = split_date_range(args.start, args.end, args.report_level)
+        start_sequence = 1
+
+    for sequence, (start_date, end_date) in enumerate(periods, start=start_sequence):
+        if interrupted:
+            break
+        if check_quota_exceeded():
+            logging.info("Quota was exceeded recently. Exiting.")
+            break
+        run_analytics_reporter(start_date, end_date, args.settings, args.report_id, sequence)
+
+    if interrupted:
+        logger.info("Execution was interrupted. Exiting after completing the current report.")
+
+if __name__ == "__main__":
+    main()
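The resume and quota checks above depend on how `split_date_range` slices the requested window. A minimal check of the month-level behaviour, assuming `ua_backup.py` is importable from the working directory (the dates are arbitrary examples):

```python
# Hedged sketch: month-level splitting walks calendar months from the start date
# and clamps the final period to the requested end date.
from ua_backup import split_date_range

periods = split_date_range("2023-01-15", "2023-03-10", "month")
print(periods)
# Expected: [('2023-01-15', '2023-02-14'), ('2023-02-15', '2023-03-10')]
```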
From cac1e98edde77067d468b1df7911d0d45fc5989b Mon Sep 17 00:00:00 2001
From: Vimal Joseph
Date: Thu, 23 May 2024 07:13:25 +0530
Subject: [PATCH 2/5] Fix dependency issue

---
 analytics_reporter.py | 1 +
 ga_data_fetcher.py    | 1 +
 2 files changed, 2 insertions(+)

diff --git a/analytics_reporter.py b/analytics_reporter.py
index 8fd82d2..11b222c 100644
--- a/analytics_reporter.py
+++ b/analytics_reporter.py
@@ -13,6 +13,7 @@
 import traceback
 import json
 import logging
+from datetime import datetime
 
 from ga_data_fetcher import get_data
 from utils import format_date, write_to_csv, append_to_csv, clear_csv_file, clean_name, load_progress, save_progress
diff --git a/ga_data_fetcher.py b/ga_data_fetcher.py
index 9c05e67..4d79d7c 100644
--- a/ga_data_fetcher.py
+++ b/ga_data_fetcher.py
@@ -9,6 +9,7 @@
 from googleapiclient.discovery import build
 from oauth2client.service_account import ServiceAccountCredentials
 from googleapiclient.errors import HttpError
+import logging
 
 def get_data(api_key, view_id, dimensions, metrics, start_date, end_date, date_formatter, page_size=5000, next_page_token=None, sample_size='DEFAULT', metric_filter=False):
     # Initialize service

From 21846f8df782360edaacd8328dbe1a26862a3b17 Mon Sep 17 00:00:00 2001
From: Vimal Joseph
Date: Thu, 23 May 2024 07:18:29 +0530
Subject: [PATCH 3/5] Fixed module import error

---
 ua_backup.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ua_backup.py b/ua_backup.py
index e82d223..ab52885 100644
--- a/ua_backup.py
+++ b/ua_backup.py
@@ -85,8 +85,8 @@ def check_quota_exceeded():
             lines = log_file.readlines()
             if lines:
                 last_line = lines[-1].strip()
-                last_exceed_time = datetime.strptime(last_line, '%Y-%m-%d %H:%M:%S')
-                if datetime.now() - last_exceed_time < timedelta(hours=24):
+                last_exceed_time = datetime.datetime.strptime(last_line, '%Y-%m-%d %H:%M:%S')
+                if datetime.datetime.now() - last_exceed_time < datetime.timedelta(hours=24):
                     logging.info("Quota was exceeded less than 24 hours ago. Exiting.")
                     return True
     return False
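The one-line change in PATCH 3/5 is needed because `ua_backup.py` does `import datetime` (the module), not `from datetime import datetime` (the class), so the parser and `timedelta` have to be reached through the module namespace. A small illustration with an arbitrary example timestamp:

```python
import datetime

# With `import datetime`, calling datetime.strptime(...) raises AttributeError;
# the parser lives on the class, i.e. datetime.datetime.strptime, as the fix uses.
last_exceed_time = datetime.datetime.strptime("2024-05-23 07:00:00", "%Y-%m-%d %H:%M:%S")
recent = datetime.datetime.now() - last_exceed_time < datetime.timedelta(hours=24)
print(recent)
```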
Exiting.") return True return False From 4ea89b1b249fdc0e97a4bedb3ddb101f00b03043 Mon Sep 17 00:00:00 2001 From: Vimal Joseph Date: Tue, 28 May 2024 12:05:02 +0530 Subject: [PATCH 4/5] Fix the merge csv bug --- analytics_reporter.py | 2 +- merge_reports.py | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/analytics_reporter.py b/analytics_reporter.py index 11b222c..7e4b2ce 100644 --- a/analytics_reporter.py +++ b/analytics_reporter.py @@ -226,7 +226,7 @@ def main(): output_file = construct_output_file(property_name, view_id, args.report_id, report_name, args.sequence) logging.info(f"Generate report for {report_name}") - generate_report(report_config, start_date, end_date, api_key, view_id, report_name, output_file, sequence) + generate_report(report_config, args.start, args.end, api_key, view_id, report_name, output_file, args.sequence) else: generate_all_reports(report_configs['reports'], args.start, args.end, api_key, view_id, property_name, args.sequence) diff --git a/merge_reports.py b/merge_reports.py index 5e33dbe..1801051 100644 --- a/merge_reports.py +++ b/merge_reports.py @@ -24,7 +24,7 @@ def merge_report_files(input_dir, output_dir): report_id = match.group(3) report_name = match.group(4) sequence = int(match.group(5)) - + key = f"{base_name}_{view_id}_{report_id}_{report_name}" if key not in report_files: report_files[key] = [] @@ -49,9 +49,9 @@ def merge_report_files(input_dir, output_dir): if start_date is None: start_date = df['ga:date'].iloc[0] end_date = df['ga:date'].iloc[-1] - num_records = len(df) - 1 if not merged_df.empty else len(df) # Subtract header only if it's not the first file + num_records = len(df) # Count all rows, no header adjustment total_records += num_records - merged_df = pd.concat([merged_df, df], ignore_index=True) if merged_df.empty else pd.concat([merged_df, df.iloc[1:]], ignore_index=True) + merged_df = pd.concat([merged_df, df], ignore_index=True) # Write the merged dataframe to CSV merged_df.to_csv(output_file, index=False) @@ -69,4 +69,3 @@ def merge_report_files(input_dir, output_dir): output_dir = args.output_dir merge_report_files(input_dir, output_dir) - From f26c19bbe5da00c31f45e692ff6e441f885ac433 Mon Sep 17 00:00:00 2001 From: Vimal Joseph Date: Tue, 28 May 2024 14:44:42 +0530 Subject: [PATCH 5/5] Update Readme to document the wrapper scripts to avoid sampling --- README.md | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index e40523b..c018af9 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,7 @@ This is a Free Software licenced under GNU GPL v2.0 or above. Please see [What i - `oauth2client` - `pyyaml` - `argparse` +- `pandas` ## Setup and Installation @@ -36,7 +37,7 @@ This is a Free Software licenced under GNU GPL v2.0 or above. Please see [What i ```sh python -m venv venv source venv/bin/activate - pip install google-api-python-client oauth2client pyyaml argparse + pip install google-api-python-client oauth2client pyyaml argparse pandas ``` 3. **Service Account and API Key:** @@ -114,6 +115,26 @@ python3 analytics_reporter.py --report_id 1 --start 2023-01-01 --end 2023-01-31 The script will automatically resume downloading from the last saved progress. +### Avoid Sampling + +The Google Analytics Reporting API may return a sample of sessions if the date range is very large or the number of records in a query is very large. 
+The Google Analytics Reporting API may return a sample of sessions if the date range is very large or a query matches a very large number of records. To address this, we have included a wrapper script `ua_backup.py`, which accepts the same arguments as `analytics_reporter.py` and splits the date range into smaller chunks based on the value of the `--report_level` argument. The available options are 'day', 'week', 'month', and 'year'.
+
+**Example:**
+
+```sh
+python3 ua_backup.py --report_id 1 --start 2020-01-01 --end 2023-01-31 --report_level day
+```
+
+This will run the query for each period (here, each day) and store the results as separate CSV files in the output folder. The script `merge_reports.py` can then be used to merge the individual CSV files into a single CSV file per report.
+
+```sh
+python3 merge_reports.py output/123423_ua-property full_report
+```
+
+You will get the merged CSV report in the `full_report` folder.
+
+**Note:** The wrapper records each completed run in `ua-backup-execution.log` so that execution can resume from the last entry after an error. It also uses `quota_exceeded.log` to track when the API quota was exceeded, and the `_progress.log` files to track individual reports. To start a fresh run from the beginning, remove these log files.
+
 ### Debugging
 
 Try removing the _progress.log file and the csv files generated to restart the download. Check the settings.yml, reports_config.yml files and make sure that the values are correct. Also refer the settings.yml.default and reports_config.yml.example.
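After merging, it is worth confirming that the stitched file actually spans the requested window. A short pandas check, using a hypothetical merged file name (the real name depends on your property, view ID, report ID and report name):

```python
import pandas as pd

# Hypothetical path; substitute whatever merge_reports.py wrote to the output folder.
df = pd.read_csv("full_report/myproperty_12345678_1_users-by-date_report_full.csv")

print(len(df))  # total rows across all merged chunks
if "ga:date" in df.columns:
    # Earliest and latest dates present in the merged report.
    print(df["ga:date"].min(), df["ga:date"].max())
```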