Skip to content

Commit f866646

Browse files
authored
More granular incremental stats (#3164)
* Experiment with a more granular incremental stats filter
Currently, the entirety of the incremental dumps is checked to filter entities whose stats/popularity need to be recomputed. This can be further optimized by storing the latest created timestamp of the listens in the incremental dumps when a stat is run. The next time a stat is run, only listens with a higher created value (added through newer incremental dumps) are considered for the filter. Note that the incremental aggregate is still computed from the entire incremental listens dump; only the filter is made more granular.
* Update popularity providers
* fix SitewideStatsQueryProvider
* fix str-ing path where needed
* fix where clause
* fix where clause - 2
* debug
* debug - 2
* need to filter incremental aggregate too
* debug 3
* debug 4
* fix popularity generation
* fix function name in sitewide stats
1 parent ec0f52d commit f866646

File tree

6 files changed

+89
-29
lines changed

6 files changed

+89
-29
lines changed

listenbrainz_spark/popularity/listens.py

+10-5
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
from typing import List
1+
from datetime import datetime
2+
from typing import List, Optional
23

34
from listenbrainz_spark.path import LISTENBRAINZ_POPULARITY_DIRECTORY, RELEASE_METADATA_CACHE_DATAFRAME
45
from listenbrainz_spark.popularity.common import get_popularity_per_artist_query, \
@@ -23,11 +24,13 @@ def get_table_prefix(self) -> str:
2324
def get_base_path(self) -> str:
2425
return LISTENBRAINZ_POPULARITY_DIRECTORY
2526

26-
def get_filter_aggregate_query(self, existing_aggregate: str, incremental_aggregate: str) -> str:
27+
def get_filter_aggregate_query(self, existing_aggregate: str, incremental_aggregate: str,
28+
existing_created: Optional[datetime]) -> str:
29+
inc_where_clause = f"WHERE created >= to_timestamp('{existing_created}')" if existing_created else ""
2730
entity_id = self.get_entity_id()
2831
return f"""
2932
WITH incremental_users AS (
30-
SELECT DISTINCT {entity_id} FROM {incremental_aggregate}
33+
SELECT DISTINCT {entity_id} FROM {incremental_aggregate} {inc_where_clause}
3134
)
3235
SELECT *
3336
FROM {existing_aggregate} ea
@@ -91,11 +94,13 @@ def get_table_prefix(self) -> str:
9194
def get_base_path(self) -> str:
9295
return LISTENBRAINZ_POPULARITY_DIRECTORY
9396

94-
def get_filter_aggregate_query(self, existing_aggregate: str, incremental_aggregate: str) -> str:
97+
def get_filter_aggregate_query(self, existing_aggregate: str, incremental_aggregate: str,
98+
existing_created: Optional[datetime]) -> str:
99+
inc_where_clause = f"WHERE created >= to_timestamp('{existing_created}')" if existing_created else ""
95100
entity_id = self.get_entity_id()
96101
return f"""
97102
WITH incremental_artists AS (
98-
SELECT DISTINCT artist_mbid, {entity_id} FROM {incremental_aggregate}
103+
SELECT DISTINCT artist_mbid, {entity_id} FROM {incremental_aggregate} {inc_where_clause}
99104
)
100105
SELECT *
101106
FROM {existing_aggregate} ea

listenbrainz_spark/schema.py

+5
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,12 @@
1010
BOOKKEEPING_SCHEMA = StructType([
1111
StructField('from_date', TimestampType(), nullable=False),
1212
StructField('to_date', TimestampType(), nullable=False),
13+
StructField('updated_at', TimestampType(), nullable=False),
14+
])
15+
16+
INCREMENTAL_BOOKKEEPING_SCHEMA = StructType([
1317
StructField('created', TimestampType(), nullable=False),
18+
StructField('updated_at', TimestampType(), nullable=False),
1419
])
1520

1621
mlhd_schema = StructType([

listenbrainz_spark/stats/incremental/incremental_stats_engine.py

+53-12
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from listenbrainz_spark import hdfs_connection
1111
from listenbrainz_spark.config import HDFS_CLUSTER_URI
1212
from listenbrainz_spark.path import INCREMENTAL_DUMPS_SAVE_PATH
13-
from listenbrainz_spark.schema import BOOKKEEPING_SCHEMA
13+
from listenbrainz_spark.schema import BOOKKEEPING_SCHEMA, INCREMENTAL_BOOKKEEPING_SCHEMA
1414
from listenbrainz_spark.stats import run_query
1515
from listenbrainz_spark.stats.incremental.message_creator import MessageCreator
1616
from listenbrainz_spark.stats.incremental.query_provider import QueryProvider
@@ -50,6 +50,7 @@ def __init__(self, provider: QueryProvider, message_creator: MessageCreator):
5050
self.message_creator = message_creator
5151
self._cache_tables = []
5252
self._only_inc = None
53+
self.incremental_table = None
5354

5455
@property
5556
def only_inc(self):
@@ -68,7 +69,7 @@ def _setup_cache_tables(self):
6869

6970
def partial_aggregate_usable(self) -> bool:
7071
""" Checks whether a partial aggregate exists and is fresh to generate the required stats. """
71-
metadata_path = self.provider.get_bookkeeping_path()
72+
metadata_path = f"{self.provider.get_bookkeeping_path()}/full"
7273
existing_aggregate_path = self.provider.get_existing_aggregate_path()
7374

7475
try:
@@ -95,7 +96,7 @@ def create_partial_aggregate(self) -> DataFrame:
9596
Returns:
9697
DataFrame: The generated partial aggregate DataFrame.
9798
"""
98-
metadata_path = self.provider.get_bookkeeping_path()
99+
metadata_path = f"{self.provider.get_bookkeeping_path()}/full"
99100
existing_aggregate_path = self.provider.get_existing_aggregate_path()
100101

101102
table = f"{self.provider.get_table_prefix()}_full_listens"
@@ -129,12 +130,35 @@ def create_incremental_aggregate(self) -> DataFrame:
129130
Returns:
130131
DataFrame: The generated incremental aggregate DataFrame.
131132
"""
132-
table = f"{self.provider.get_table_prefix()}_incremental_listens"
133+
self.incremental_table = f"{self.provider.get_table_prefix()}_incremental_listens"
133134
read_files_from_HDFS(INCREMENTAL_DUMPS_SAVE_PATH) \
134-
.createOrReplaceTempView(table)
135-
inc_query = self.provider.get_aggregate_query(table, self._cache_tables)
135+
.createOrReplaceTempView(self.incremental_table)
136+
inc_query = self.provider.get_aggregate_query(self.incremental_table, self._cache_tables)
136137
return run_query(inc_query)
137138

def bookkeep_incremental_aggregate(self):
    """ Record the latest `created` timestamp of the incremental listens used by this run.

    The value is written (overwriting any previous value) as JSON under the
    `incremental` sub-directory of the provider's bookkeeping path, together with
    the time at which the bookkeeping was done.

    Nothing is written when no incremental listens table has been registered for
    this engine, or when that table has no `created` value to record.
    """
    # incremental_table is only assigned in create_incremental_aggregate(); if that
    # was never called there is no table to query and nothing to bookkeep.
    if self.incremental_table is None:
        return

    metadata_path = f"{self.provider.get_bookkeeping_path()}/incremental"
    query = f"SELECT max(created) AS latest_created_at FROM {self.incremental_table}"
    latest_created_at = run_query(query).collect()[0]["latest_created_at"]

    # max() over zero rows is NULL; the `created` field of INCREMENTAL_BOOKKEEPING_SCHEMA
    # is declared non-nullable, so skip writing rather than fail on a None value.
    if latest_created_at is None:
        return

    metadata_df = listenbrainz_spark.session.createDataFrame(
        [(latest_created_at, datetime.now())],
        schema=INCREMENTAL_BOOKKEEPING_SCHEMA
    )
    metadata_df.write.mode("overwrite").json(metadata_path)
148+
def get_incremental_dumps_existing_created(self):
    """ Read back the `created` timestamp stored by the previous incremental bookkeeping.

    Returns:
        The stored `created` value, or None when reading the bookkeeping metadata
        raises AnalysisException or yields no rows.
    """
    metadata_path = f"{self.provider.get_bookkeeping_path()}/incremental"
    try:
        rows = listenbrainz_spark \
            .session \
            .read \
            .schema(INCREMENTAL_BOOKKEEPING_SCHEMA) \
            .json(f"{HDFS_CLUSTER_URI}{metadata_path}") \
            .collect()
    except AnalysisException:
        # presumably raised when no bookkeeping has been written yet - verify
        return None

    # An empty result would otherwise surface as an IndexError on rows[0].
    if not rows:
        return None
    return rows[0]["created"]
161+
138162
def generate_stats(self) -> DataFrame:
139163
self._setup_cache_tables()
140164
prefix = self.provider.get_table_prefix()
@@ -155,14 +179,30 @@ def generate_stats(self) -> DataFrame:
155179
inc_df.createOrReplaceTempView(inc_table)
156180

157181
if self._only_inc:
158-
filter_query = self.provider.get_filter_aggregate_query(partial_table, inc_table)
159-
filtered_aggregate_df = run_query(filter_query)
160-
filtered_table = f"{prefix}_filtered_aggregate"
161-
filtered_aggregate_df.createOrReplaceTempView(filtered_table)
182+
existing_created = self.get_incremental_dumps_existing_created()
183+
184+
filter_existing_query = self.provider.get_filter_aggregate_query(
185+
partial_table,
186+
self.incremental_table,
187+
existing_created
188+
)
189+
filtered_existing_aggregate_df = run_query(filter_existing_query)
190+
filtered_existing_table = f"{prefix}_filtered_existing_aggregate"
191+
filtered_existing_aggregate_df.createOrReplaceTempView(filtered_existing_table)
192+
193+
filter_incremental_query = self.provider.get_filter_aggregate_query(
194+
inc_table,
195+
self.incremental_table,
196+
existing_created
197+
)
198+
filtered_incremental_aggregate_df = run_query(filter_incremental_query)
199+
filtered_incremental_table = f"{prefix}_filtered_incremental_aggregate"
200+
filtered_incremental_aggregate_df.createOrReplaceTempView(filtered_incremental_table)
162201
else:
163-
filtered_table = partial_table
202+
filtered_existing_table = partial_table
203+
filtered_incremental_table = inc_table
164204

165-
final_query = self.provider.get_combine_aggregates_query(filtered_table, inc_table)
205+
final_query = self.provider.get_combine_aggregates_query(filtered_existing_table, filtered_incremental_table)
166206
final_df = run_query(final_query)
167207
else:
168208
final_df = partial_df
@@ -183,3 +223,4 @@ def run(self) -> Iterator[Dict]:
183223
yield message
184224
if not self.only_inc:
185225
yield self.message_creator.create_end_message()
226+
self.bookkeep_incremental_aggregate()

listenbrainz_spark/stats/incremental/query_provider.py

+8-7
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import abc
2-
from typing import List
2+
from datetime import datetime
3+
from typing import List, Optional
34

45
from listenbrainz_spark.stats.incremental.range_selector import ListenRangeSelector
56

@@ -41,7 +42,7 @@ def get_existing_aggregate_path(self) -> str:
4142
return f"{self.get_base_path()}/aggregates/{self.entity}/{self.stats_range}"
4243

4344
def get_bookkeeping_path(self) -> str:
44-
""" Returns the HDFS path for bookkeeping metadata. """
45+
""" Returns the HDFS path for bookkeeping metadata directory. """
4546
return f"{self.get_base_path()}/bookkeeping/{self.entity}/{self.stats_range}"
4647

4748
@abc.abstractmethod
@@ -68,14 +69,14 @@ def get_combine_aggregates_query(self, existing_aggregate: str, incremental_aggr
6869
raise NotImplementedError()
6970

7071
@abc.abstractmethod
71-
def get_filter_aggregate_query(self, existing_aggregate: str, incremental_aggregate: str) -> str:
72+
def get_filter_aggregate_query(self, aggregate: str, inc_listens_table: str, existing_created: datetime) -> str:
7273
"""
73-
Return the query to filter the existing aggregate based on the listens present in incremental
74-
aggregate.
74+
Return the query to filter the aggregate based on the listens submitted since existing created timestamp.
7575
7676
Args:
77-
existing_aggregate: The table name for existing aggregate.
78-
incremental_aggregate: The table name for incremental aggregate.
77+
aggregate: The table name for the aggregate to filter
78+
inc_listens_table: The table name for incremental listens.
79+
existing_created: The max listen created value last time incremental stats for this query was run.
7980
"""
8081
raise NotImplementedError()
8182

listenbrainz_spark/stats/incremental/sitewide/entity.py

+4-2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import abc
22
import logging
3-
from typing import Iterator, Dict
3+
from datetime import datetime
4+
from typing import Iterator, Dict, Optional
45

56
from pydantic import ValidationError
67
from pyspark.sql import DataFrame
@@ -34,7 +35,8 @@ def get_base_path(self) -> str:
3435
def get_table_prefix(self) -> str:
3536
return f"sitewide_{self.entity}_{self.stats_range}"
3637

37-
def get_filter_aggregate_query(self, existing_aggregate: str, incremental_aggregate: str) -> str:
38+
def get_filter_aggregate_query(self, existing_aggregate: str, incremental_aggregate: str,
39+
existed_created: Optional[datetime]) -> str:
3840
return f"SELECT * FROM {existing_aggregate}"
3941

4042

listenbrainz_spark/stats/incremental/user/entity.py

+9-3
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from data.model.user_release_group_stat import ReleaseGroupRecord
1212
from data.model.user_release_stat import ReleaseRecord
1313
from listenbrainz_spark.path import LISTENBRAINZ_USER_STATS_DIRECTORY
14+
from listenbrainz_spark.stats import run_query
1415
from listenbrainz_spark.stats.incremental.message_creator import StatsMessageCreator
1516
from listenbrainz_spark.stats.incremental.query_provider import QueryProvider
1617
from listenbrainz_spark.stats.incremental.range_selector import ListenRangeSelector
@@ -37,17 +38,22 @@ def get_table_prefix(self) -> str:
3738
def get_entity_id(self):
3839
return "user_id"
3940

def get_filter_aggregate_query(self, aggregate, inc_listens_table, existing_created):
    """ Filter an aggregate to only include rows for entities having listens in the incremental dumps.

    Args:
        aggregate: The table name for the aggregate to filter.
        inc_listens_table: The table name for incremental listens.
        existing_created: The max listen created value recorded the last time incremental
            stats for this query were run, or None if no value has been recorded.

    Returns:
        The SQL query selecting the rows of `aggregate` whose entity id appears in the
        incremental listens (restricted to listens created at or after `existing_created`
        when it is given).
    """
    entity_id = self.get_entity_id()
    # With no recorded timestamp, consider every incremental listen. Interpolating None
    # would produce to_timestamp('None'), which is not a valid timestamp and would keep
    # the filter from selecting the intended entities.
    where_clause = f"WHERE created >= to_timestamp('{existing_created}')" if existing_created else ""
    return f"""
        WITH incremental_users AS (
            SELECT DISTINCT {entity_id}
              FROM {inc_listens_table}
              {where_clause}
        )
        SELECT *
          FROM {aggregate} ea
         WHERE EXISTS(SELECT 1 FROM incremental_users iu WHERE iu.{entity_id} = ea.{entity_id})
    """
5359

0 commit comments

Comments
 (0)