From bdc4457e1c5e9a73bdbcc2e5c1ec4463966ee9ce Mon Sep 17 00:00:00 2001 From: Osundwa Jeff Date: Wed, 19 Feb 2025 20:54:16 +0300 Subject: [PATCH] fix: Remove "time" field from dataset if no time column exists (#429) * fix: Remove "time" field from dataset if no time column exists * fix: Enhance datetime handling by conditionally combining date and time columns * test: Add unit test for to_json * test: rm blank line in test_to_json method, fix flake8 --- django_project/gap/providers/observation.py | 19 +++++-- .../gap/tests/providers/test_observation.py | 51 +++++++++++++++++++ 2 files changed, 65 insertions(+), 5 deletions(-) diff --git a/django_project/gap/providers/observation.py b/django_project/gap/providers/observation.py index 8278b1e..3db44e8 100644 --- a/django_project/gap/providers/observation.py +++ b/django_project/gap/providers/observation.py @@ -584,11 +584,20 @@ def to_json(self): } # Convert query results to a DataFrame df = self.conn.sql(self.query).df() - # Combine date and time columns - df['datetime'] = pd.to_datetime( - df['date'].dt.strftime('%Y-%m-%d') + ' ' + df['time'] - ) - df = df.drop(columns=['date', 'time', 'lat', 'lon']) + + if self.has_time_column and 'time' in df.columns: + # Combine date and time columns if time column exists + df['datetime'] = pd.to_datetime( + df['date'].dt.strftime('%Y-%m-%d') + ' ' + df['time'] + ) + drop_columns = ['date', 'time'] + else: + # If dataset lacks time, only use the date + df['datetime'] = df['date'] + drop_columns = ['date'] + drop_columns.extend(['lat', 'lon']) + # Drop unnecessary columns safely + df = df.drop(columns=drop_columns, errors='ignore') # Replace NaN with None df = df.replace({np.nan: None}) output['data'] = df.to_dict(orient="records") diff --git a/django_project/gap/tests/providers/test_observation.py b/django_project/gap/tests/providers/test_observation.py index 9a8b514..a7fdf7f 100644 --- a/django_project/gap/tests/providers/test_observation.py +++ b/django_project/gap/tests/providers/test_observation.py @@ -9,6 +9,7 @@ import duckdb import xarray as xr import pandas as pd +import numpy as np from django.test import TestCase from datetime import datetime @@ -892,3 +893,53 @@ def test_to_netcdf_drops_station_id_and_sets_index( # Ensure NetCDF file was saved mock_s3_storage.save.assert_called_once() self.assertEqual(netcdf_output, "s3://test-bucket/output.nc") + + @patch("gap.providers.observation.duckdb.connect") + def test_to_json(self, mock_duckdb_connect): + """Test to_json handles NaN values and removes unnecessary columns.""" + # Mock DuckDB connection + mock_conn = MagicMock() + mock_duckdb_connect.return_value = mock_conn + + # Mock SQL query result + mock_conn.sql.return_value.df.return_value = pd.DataFrame({ + "date": pd.date_range(start="2023-01-01", periods=3), + "time": ["12:00:00", "14:00:00", None], # Some missing times + "lat": [0.5, 0.6, None], # Drop lat + "lon": [36.5, None, 36.7], # Drop lon + "value": [100, np.nan, 300] # Include NaN + }) + + # Create reader instance + location_input = DatasetReaderInput.from_point(Point(36.8, -1.3)) + reader_value = ObservationParquetReaderValue( + mock_conn, + location_input, + [], + datetime(2023, 1, 1), + datetime(2023, 1, 3), + "SELECT * FROM test" + ) + + # Mock has_time_column to avoid modifying it directly + with patch.object( + ObservationParquetReaderValue, + "has_time_column", + return_value=True + ): + output = reader_value.to_json() + + # Ensure 'data' is present + self.assertIn("data", output) + self.assertEqual(len(output["data"]), 3) + + # Ensure 'datetime' is present and formatted + for entry in output["data"]: + self.assertIn("datetime", entry) + self.assertNotIn("date", entry) # Ensure 'date' was removed + self.assertNotIn("time", entry) # Ensure 'time' was merged + self.assertNotIn("lat", entry) # Ensure 'lat' was removed + self.assertNotIn("lon", entry) # Ensure 'lon' was removed + + # Validate NaN conversion to None + self.assertIsNone(output["data"][1].get("value"))