Commit c20f275
Add batched_update dag (#2331)
* Initial pass at batched update DAG
* Refactor into separate files
* Allow for easier testing by passing through postgres_conn_id
* Pull SQL test utils out into shared file
* Test batched update DAG
* Clean up variable names, only notify Slack when not dry_run
* Reorder params
* Undo testing value
* Remove TODOs
* Generate dag documentation
* Add options for resuming without recreating temp table
* Respect dry_run in expected_count task
* Update dag docs
* Update test
* Handle explicitly passed None batch_start
* Update comment, variable
1 parent 0118f50 commit c20f275

File tree

7 files changed: +1033 -179 lines changed

catalog/DAGs.md

Lines changed: 76 additions & 0 deletions
@@ -42,6 +42,7 @@ The following are DAGs grouped by their primary tag:

| DAG ID | Schedule Interval |
| --------------------------------------------------------------------------------- | ----------------- |
| [`batched_update`](#batched_update) | `None` |
| [`recreate_audio_popularity_calculation`](#recreate_audio_popularity_calculation) | `None` |
| [`recreate_image_popularity_calculation`](#recreate_image_popularity_calculation) | `None` |
| [`report_pending_reported_media`](#report_pending_reported_media) | `@weekly` |
@@ -112,6 +113,7 @@ The following is documentation associated with each DAG (where available):

1. [`add_license_url`](#add_license_url)
1. [`airflow_log_cleanup`](#airflow_log_cleanup)
1. [`audio_data_refresh`](#audio_data_refresh)
1. [`batched_update`](#batched_update)
1. [`check_silenced_dags`](#check_silenced_dags)
1. [`create_filtered_audio_index`](#create_filtered_audio_index)
1. [`create_filtered_image_index`](#create_filtered_image_index)
@@ -219,6 +221,80 @@ and related PRs:

- [[Feature] Data refresh orchestration DAG](https://github.com/WordPress/openverse-catalog/issues/353)
- [[Feature] Merge popularity calculations and data refresh into a single DAG](https://github.com/WordPress/openverse-catalog/issues/453)

## `batched_update`

### Batched Update DAG

This DAG is used to run a batched SQL update on a media table in the Catalog
database. It is automatically triggered by the `popularity_refresh` DAGs to
refresh popularity data using newly calculated constants, but it can also be
triggered manually with custom SQL operations.

The DAG must be run with a valid dag_run configuration specifying the SQL
commands to be run. The DAG will then split the rows to be updated into batches
and report to Slack when all batches have been updated. It handles all
deadlocking and timeout concerns, ensuring that the provided SQL runs without
interfering with ingestion. For more information, see the implementation plan:
https://docs.openverse.org/projects/proposals/popularity_optimizations/20230420-implementation_plan_popularity_optimizations.html#special-considerations-avoiding-deadlocks-and-timeouts
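To make the batching strategy concrete, the following sketch shows the general
shape of the two statements involved (illustrative only; these are not the
actual templates from this commit's `constants` module). The selected rows are
first snapshotted into a temp table keyed by a monotonically increasing
`row_id`, and the update is then applied in fixed-size `row_id` ranges so that
each statement locks only a bounded number of rows:

```
# Illustrative sketch -- names and exact SQL are assumptions, not the
# commit's actual templates.
CREATE_TEMP_TABLE = """
    CREATE TABLE {temp_table_name} AS
    SELECT ROW_NUMBER() OVER() AS row_id, identifier
    FROM {table_name}
    {select_query};
"""

UPDATE_BATCH = """
    UPDATE {table_name}
    {update_query}
    WHERE identifier IN (
        SELECT identifier FROM {temp_table_name}
        WHERE row_id > {batch_start} AND row_id <= {batch_end}
    );
"""
```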
By default the DAG will run as a dry_run, logging the generated SQL but not
actually running it. To actually perform the update, the `dry_run` parameter
must be explicitly set to `false` in the configuration.

Required Dagrun Configuration parameters:

- query_id: a string identifier which will be appended to the name of the
  temporary table used in the update
- table_name: the name of the table to update. Must be a valid media table
- select_query: a SQL `WHERE` clause used to select the rows that will be
  updated
- update_query: the SQL `UPDATE` expression to be run on all selected rows

Optional params:

- dry_run: bool, whether to actually run the generated SQL. True by default.
- batch_size: int number of records to process in each batch. By default,
  10_000
- update_timeout: int number of seconds to run an individual batch update
  before timing out. By default, 3600 (one hour)
- batch_start: int index into the temp table at which to start the update. By
  default, this is 0 and all rows in the temp table are updated.
- resume_update: boolean indicating whether to attempt to resume an update
  using an existing temp table matching the `query_id`. When True, a new temp
  table is not created.
An example dag_run configuration used to set the thumbnails of all Flickr
images to null would look like this:

```
{
    "query_id": "my_flickr_query",
    "table_name": "image",
    "select_query": "WHERE provider='flickr'",
    "update_query": "SET thumbnail=null",
    "batch_size": 10,
    "dry_run": false
}
```
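This JSON can be supplied as the dag_run configuration when triggering the DAG
manually, for example via the "Trigger DAG w/ config" option in the Airflow web
UI or with the `airflow dags trigger batched_update --conf '<json>'` CLI
command.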
It is possible to resume an update from an arbitrary starting point on an
existing temp table, for example if a DAG succeeds in creating the temp table
but fails midway through the update. To do so, set the `resume_update` param to
True and select your desired `batch_start`. For instance, if the example DAG
given above failed after processing the first 50_000 records, you might run:

```
{
    "query_id": "my_flickr_query",
    "table_name": "image",
    "select_query": "WHERE provider='flickr'",
    "update_query": "SET thumbnail=null",
    "batch_size": 10,
    "batch_start": 50000,
    "resume_update": true,
    "dry_run": false
}
```
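With this configuration the DAG skips temp table creation entirely, re-counts
the rows in the existing temp table for `my_flickr_query`, subtracts the
50_000 already-processed rows from the expected count, and resumes batching at
row 50_000 (see `resume_update` and `get_expected_update_count` in the task
definitions below).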
## `check_silenced_dags`

### Silenced DAGs check
Lines changed: 147 additions & 0 deletions
@@ -0,0 +1,147 @@
import logging
from datetime import timedelta

from airflow.decorators import task
from airflow.models.abstractoperator import AbstractOperator

from common import slack
from common.constants import POSTGRES_CONN_ID
from common.sql import PostgresHook
from database.batched_update import constants


logger = logging.getLogger(__name__)


def _single_value(cursor):
    # Handler that extracts a single scalar (e.g. a COUNT) from the cursor.
    try:
        row = cursor.fetchone()
        return row[0]
    except Exception as e:
        raise ValueError("Unable to extract expected row data from cursor") from e


@task.branch
def resume_update(
    resume_update: bool,
):
    """
    Branch to skip temp table creation when this DagRun is resuming from an
    existing temp table, or to create the table otherwise.
    """
    if resume_update:
        # Skip table creation and indexing
        return constants.GET_EXPECTED_COUNT_TASK_ID
    return constants.CREATE_TEMP_TABLE_TASK_ID


@task
def get_expected_update_count(query_id: str, batch_start: int | None, dry_run: bool):
    """
    Get the number of records left to update, when resuming an update
    on an existing temp table.
    """
    total_count = run_sql.function(
        dry_run=dry_run,
        sql_template=constants.SELECT_TEMP_TABLE_QUERY,
        query_id=query_id,
        handler=_single_value,
    )

    if batch_start:
        total_count -= batch_start
    return max(total_count, 0)


@task
def run_sql(
    dry_run: bool,
    sql_template: str,
    query_id: str,
    log_sql: bool = True,
    postgres_conn_id: str = POSTGRES_CONN_ID,
    task: AbstractOperator = None,
    timeout: timedelta = None,
    handler: callable = constants.RETURN_ROW_COUNT,
    **kwargs,
):
    query = sql_template.format(
        temp_table_name=constants.TEMP_TABLE_NAME.format(query_id=query_id), **kwargs
    )
    if dry_run:
        logger.info(
            "This is a dry run: no SQL will be executed. To perform the updates,"
            " rerun the DAG with the conf option `'dry_run': false`."
        )
        logger.info(query)
        return 0

    postgres = PostgresHook(
        postgres_conn_id=postgres_conn_id,
        default_statement_timeout=(
            timeout if timeout else PostgresHook.get_execution_timeout(task)
        ),
        log_sql=log_sql,
    )

    return postgres.run(query, handler=handler)


@task
def update_batches(
    expected_row_count: int,
    batch_size: int,
    dry_run: bool,
    table_name: str,
    query_id: str,
    update_query: str,
    update_timeout: int,
    batch_start: int = 0,
    postgres_conn_id: str = POSTGRES_CONN_ID,
    task: AbstractOperator = None,
    **kwargs,
):
    updated_count = 0
    if batch_start is None:
        batch_start = 0
    # Remember where we started so only the first batch's query is logged.
    first_batch_start = batch_start

    while batch_start <= expected_row_count:
        batch_end = batch_start + batch_size

        logger.info(f"Updating rows with id {batch_start} through {batch_end}.")
        count = run_sql.function(
            dry_run=dry_run,
            sql_template=constants.UPDATE_BATCH_QUERY,
            query_id=query_id,
            # Only log the query the first time, so as not to spam the logs
            log_sql=batch_start == first_batch_start,
            postgres_conn_id=postgres_conn_id,
            task=task,
            timeout=update_timeout,
            table_name=table_name,
            update_query=update_query,
            batch_start=batch_start,
            batch_end=batch_end,
        )

        updated_count += count
        batch_start = batch_end
        logger.info(
            f"Updated {updated_count} rows. {expected_row_count - updated_count}"
            " remaining."
        )

    return updated_count


@task
def notify_slack(text: str, dry_run: bool) -> None:
    # Report the result to Slack, or only log it during a dry run.
    if not dry_run:
        slack.send_message(
            text,
            username=constants.SLACK_USERNAME,
            icon_emoji=constants.SLACK_ICON,
            dag_id=constants.DAG_ID,
        )
    else:
        logger.info(text)
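Note that `get_expected_update_count` and `update_batches` call
`run_sql.function(...)`: the `.function` attribute of a TaskFlow-decorated
task is the original, undecorated callable, so the SQL helper runs inline
within the calling task rather than as a separate Airflow task. This also
makes the helpers easy to exercise locally. A minimal dry-run sketch follows
(not part of the commit; the import path is an assumption, and it relies only
on `constants.UPDATE_BATCH_QUERY` accepting the kwargs shown above):

```
# Import path is an assumption based on this file's own imports.
from database.batched_update.batched_update import update_batches

# Logs the generated SQL for three batches (rows 0-10, 10-20, 20-30) and
# returns 0, because dry_run=True makes run_sql log the query and
# short-circuit instead of executing anything against Postgres.
update_batches.function(
    expected_row_count=25,
    batch_size=10,
    dry_run=True,
    table_name="image",
    query_id="my_flickr_query",
    update_query="SET thumbnail=null",
    update_timeout=3600,
    batch_start=0,
)
```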
