Fix get_intermediate_stats_df (#3190)

amCap1712 · web-flow · commit 494ec5edd68c · 2025-02-20T17:00:45.000+05:30
When loading the intermediate stats dataframe, if start and end are not
provided, it reads the files as is, without excluding the year and month
columns used for partitioning. This causes an error when trying to combine it
with the incremental listens dataframe, which doesn't have those two fields.
Hence, fix the loading so that the explicit column list is always selected and
the where clause is only added when a date range is given.
diff --git a/listenbrainz_spark/utils/__init__.py b/listenbrainz_spark/utils/__init__.py
@@ -177,16 +177,15 @@ def get_listens_from_dump(start: datetime, end: datetime, include_incremental=Tr
 
 def get_intermediate_stats_df(start: datetime, end: datetime):
     if start is None and end is None:
-        return read_files_from_HDFS(LISTENBRAINZ_INTERMEDIATE_STATS_DIRECTORY)
-
-    filters = []
-
-    current = start
-    step = relativedelta(months=1)
-    while current <= end:
-        filters.append(f"(year = {current.year} AND month = {current.month})")
-        current += step
-    combined_filter = "(\n       " + "\n    OR ".join(filters) + "\n       )"
+        where_clause = ""
+    else:
+        filters = []
+        current = start
+        step = relativedelta(months=1)
+        while current <= end:
+            filters.append(f"(year = {current.year} AND month = {current.month})")
+            current += step
+        where_clause = "where (\n       " + "\n    OR ".join(filters) + "\n       )"
 
     query = dedent(f"""\
         select listened_at
@@ -201,7 +200,7 @@ def get_intermediate_stats_df(start: datetime, end: datetime):
              , recording_mbid
              , artist_credit_mbids
           from parquet.`{LISTENBRAINZ_INTERMEDIATE_STATS_DIRECTORY}`
-         where """) + combined_filter
+    """) + where_clause
     return listenbrainz_spark.sql_context.sql(query)