
Commit 492af14

Merge pull request #3217 from metabrainz/source-profile
1. Upgrade the Spark cluster to 3.5.5, Hadoop to 3.4.1, Python to 3.13, and bump other Python dependencies.
2. Add a cleanup script to remove old Spark applications from workers.
3. Remove the use of the deprecated SQLContext.
4. Use read_files_from_HDFS where possible (a short sketch of the substitutions for 3 and 4 follows the file summary below).
5. Fix artist map stats broken due to null country values.
6. Add a try/except around each request consumer job to avoid kombu client crashes.
7. Disable readSideCharPadding to avoid OOMs during the artist country data import.

Some of the changes for 1 and 2 are implemented in recent commits of https://github.com/metabrainz/ansible-role-spark.
2 parents: a222f52 + 53a3bb5 · commit 492af14

27 files changed: +76, -183 lines
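
Most of the file diffs below are the same two mechanical substitutions for points 3 and 4 of the summary: SQL goes through the SparkSession itself instead of the removed sql_context, and parquet reads go through the existing read_files_from_HDFS helper instead of sql_context.read.parquet. A minimal sketch of the pattern, assuming an initialised listenbrainz_spark.session and the helper behaviour suggested by the diffs (the view name and query here are illustrative only):

    import listenbrainz_spark
    from listenbrainz_spark.path import RECORDING_LENGTH_DATAFRAME
    from listenbrainz_spark.utils import read_files_from_HDFS

    # was: listenbrainz_spark.sql_context.read.parquet(config.HDFS_CLUSTER_URI + RECORDING_LENGTH_DATAFRAME)
    metadata_df = read_files_from_HDFS(RECORDING_LENGTH_DATAFRAME)
    metadata_df.createOrReplaceTempView("recording_length")   # illustrative view name

    # was: listenbrainz_spark.sql_context.sql(...)
    row = listenbrainz_spark.session.sql(
        "SELECT count(*) AS cnt FROM recording_length"        # illustrative query
    ).collect()[0]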

Dockerfile.spark

+1 -1

@@ -4,7 +4,7 @@ COPY docker/spark-cluster-config/test/core-site.xml $HADOOP_HOME/etc/hadoop/core
 COPY docker/spark-cluster-config/test/hdfs-site.xml $HADOOP_HOME/etc/hadoop/hdfs-site.xml
 COPY docker/spark-cluster-config/test/spark-env.sh $SPARK_HOME/conf/spark-env.sh

-RUN pip3 install pip==21.0.1
+RUN pip3 install pip==25.0.1 setuptools wheel

 WORKDIR /rec
docker/Dockerfile.spark.base

+6 -6

@@ -1,4 +1,4 @@
-ARG PYTHON_BASE_IMAGE_VERSION=3.9-focal-20220315
+ARG PYTHON_BASE_IMAGE_VERSION=3.12-20241130
 FROM metabrainz/python:$PYTHON_BASE_IMAGE_VERSION

 ARG PYTHON_BASE_IMAGE_VERSION

@@ -26,9 +26,9 @@ RUN wget https://github.com/jwilder/dockerize/releases/download/$DOCKERIZE_VERSI

 WORKDIR /usr/local

-ENV JAVA_VERSION 11.0.21
+ENV JAVA_VERSION 11.0.26
 ENV JAVA_MAJOR_VERSION 11
-ENV JAVA_BUILD_VERSION 9
+ENV JAVA_BUILD_VERSION 4
 RUN wget https://github.com/adoptium/temurin${JAVA_MAJOR_VERSION}-binaries/releases/download/jdk-${JAVA_VERSION}%2B${JAVA_BUILD_VERSION}/OpenJDK${JAVA_MAJOR_VERSION}U-jdk_x64_linux_hotspot_${JAVA_VERSION}_${JAVA_BUILD_VERSION}.tar.gz \
     && tar xzf OpenJDK${JAVA_MAJOR_VERSION}U-jdk_x64_linux_hotspot_${JAVA_VERSION}_${JAVA_BUILD_VERSION}.tar.gz \
     && mv jdk-${JAVA_VERSION}+${JAVA_BUILD_VERSION} /usr/local/jdk \

@@ -38,7 +38,7 @@ ENV PATH $JAVA_HOME/bin:$PATH

 COPY apache-download.sh /apache-download.sh

-ENV HADOOP_VERSION 3.3.5
+ENV HADOOP_VERSION 3.4.1
 RUN /apache-download.sh hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz \
     && tar xzf hadoop-${HADOOP_VERSION}.tar.gz \
     && mv hadoop-${HADOOP_VERSION} /usr/local/hadoop \

@@ -48,7 +48,7 @@ ENV PATH $HADOOP_HOME/bin:$PATH

 RUN mkdir /hdfs

-ENV SPARK_VERSION 3.4.0
+ENV SPARK_VERSION 3.5.5
 RUN /apache-download.sh spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-without-hadoop.tgz \
     && tar xzf spark-${SPARK_VERSION}-bin-without-hadoop.tgz \
     && mv spark-${SPARK_VERSION}-bin-without-hadoop /usr/local/spark \

@@ -57,6 +57,6 @@ ENV SPARK_HOME /usr/local/spark
 ENV PATH $SPARK_HOME/bin:$PATH
 ENV PYTHONPATH $SPARK_HOME/python/lib/py4j-0.10.9.7-src.zip:$SPARK_HOME/python:$PYTHONPATH

-ENV POSTGRESQL_DRIVER_VERSION 42.7.0
+ENV POSTGRESQL_DRIVER_VERSION 42.7.5
 RUN wget -O postgresql-${POSTGRESQL_DRIVER_VERSION}.jar https://jdbc.postgresql.org/download/postgresql-${POSTGRESQL_DRIVER_VERSION}.jar \
     && mv postgresql-${POSTGRESQL_DRIVER_VERSION}.jar ${SPARK_HOME}/jars

docker/start-spark-request-consumer.sh

+7 -6

@@ -5,14 +5,14 @@ cd "$(dirname "${BASH_SOURCE[0]}")/../"

 rm -rf pyspark_venv pyspark_venv.tar.gz listenbrainz_spark_request_consumer.zip models.zip

-python3 -m venv pyspark_venv
+python3.13 -m venv pyspark_venv
 source pyspark_venv/bin/activate
-pip install -r requirements_spark.txt
-pip install venv-pack
+pip install --upgrade pip setuptools wheel venv-pack -r requirements_spark.txt
 venv-pack -o pyspark_venv.tar.gz

-export PYSPARK_DRIVER_PYTHON=python
-export PYSPARK_PYTHON=./environment/bin/python
+VENV_PATH="$(realpath pyspark_venv)"
+export PYSPARK_DRIVER_PYTHON="${VENV_PATH}/bin/python3.13"
+export PYSPARK_PYTHON=./environment/bin/python3.13

 GIT_COMMIT_SHA="$(git describe --tags --dirty --always)"
 echo "$GIT_COMMIT_SHA" > .git-version

@@ -25,8 +25,9 @@ source spark_config.sh
     --master spark://leader:7077 \
     --archives "pyspark_venv.tar.gz#environment" \
     --conf "spark.cores.max=$MAX_CORES" \
+    --conf "spark.driver.maxResultSize=$DRIVER_MAX_RESULT_SIZE" \
     --executor-cores "$EXECUTOR_CORES" \
     --executor-memory "$EXECUTOR_MEMORY" \
     --driver-memory "$DRIVER_MEMORY" \
     --py-files listenbrainz_spark_request_consumer.zip,models.zip \
-    spark_manage.py request_consumer
+    spark_manage.py

listenbrainz/db/stats.py

-1

@@ -31,7 +31,6 @@
 from sentry_sdk import start_span

 from data.model.common_stat import StatApi
-from data.model.user_artist_map import UserArtistMapRecord
 from listenbrainz.db import couchdb
 from listenbrainz.db.couchdb import try_insert_data
 from listenbrainz.db.user import get_users_by_id

listenbrainz_spark/__init__.py

+6 -7

@@ -5,21 +5,20 @@
 _formatter = logging.Formatter("%(asctime)s %(name)-20s %(levelname)-8s %(message)s")
 _handler.setFormatter(_formatter)

-_logger = logging.getLogger("listenbrainz_spark")
+_logger = logging.getLogger(__name__)
 _logger.setLevel(logging.INFO)
 _logger.addHandler(_handler)

 import sentry_sdk

 from py4j.protocol import Py4JJavaError
-from pyspark.sql import SparkSession, SQLContext
+from pyspark.sql import SparkSession

 from listenbrainz_spark.exceptions import SparkSessionNotInitializedException
 from listenbrainz_spark import config

 session = None
 context = None
-sql_context = None


 def init_spark_session(app_name):

@@ -30,15 +29,16 @@ def init_spark_session(app_name):
     """
     if hasattr(config, "LOG_SENTRY"):  # attempt to initialize sentry_sdk only if configuration available
         sentry_sdk.init(**config.LOG_SENTRY)
-    global session, context, sql_context
+    global session, context
     try:
+        # readSideCharPadding enabled causes OOM when importing artist_country_code cache data
         session = SparkSession \
            .builder \
            .appName(app_name) \
+           .config("spark.sql.readSideCharPadding", "false") \
            .getOrCreate()
        context = session.sparkContext
        context.setLogLevel("ERROR")
-       sql_context = SQLContext(context)
    except Py4JJavaError as err:
        raise SparkSessionNotInitializedException(app_name, err.java_exception)

@@ -51,7 +51,7 @@ def init_test_session(app_name):
     Set spark.driver.host to avoid tests from hanging (get_listens_from_dump hangs when taking union
     of full dump and incremental dump listens), see https://issues.apache.org/jira/browse/SPARK-16087
     """
-    global session, context, sql_context
+    global session, context
     try:
         session = SparkSession \
            .builder \

@@ -69,6 +69,5 @@ def init_test_session(app_name):
            .getOrCreate()
        context = session.sparkContext
        context.setLogLevel("ERROR")
-       sql_context = SQLContext(context)
    except Py4JJavaError as err:
        raise SparkSessionNotInitializedException(app_name, err.java_exception)
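
For context on the new config flag: spark.sql.readSideCharPadding was introduced in Spark 3.4 and defaults to true, padding CHAR(n) column values to their declared length when they are read, in addition to the existing write-side padding. The commit disables it globally on the session because that padding blows up memory when importing the artist_country_code cache data. A minimal sketch of the same setting on a standalone session (names here are illustrative, not from the repo):

    from pyspark.sql import SparkSession

    # Illustrative only: any job reading CHAR(n) columns can opt out of
    # read-side padding the same way.
    spark = (
        SparkSession.builder
        .appName("char-padding-example")                     # hypothetical app name
        .config("spark.sql.readSideCharPadding", "false")    # do not pad CHAR(n) values on read
        .getOrCreate()
    )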

listenbrainz_spark/hdfs/utils.py

+2 -58

@@ -1,32 +1,12 @@
 import logging
-import os
 from pathlib import Path

-from hdfs.util import HdfsError
-
 from listenbrainz_spark import hdfs_connection
-from listenbrainz_spark.exceptions import (HDFSDirectoryNotDeletedException,
-                                           PathNotFoundException)
+from listenbrainz_spark.exceptions import HDFSDirectoryNotDeletedException

 logger = logging.getLogger(__name__)


-# A typical listen is of the form:
-# {
-#   "artist_mbids": [],
-#   "artist_name": "Cake",
-#   "listened_at": "2005-02-28T20:39:08Z",
-#   "recording_msid": "c559b2f8-41ff-4b55-ab3c-0b57d9b85d11",
-#   "recording_mbid": "1750f8ca-410e-4bdc-bf90-b0146cb5ee35",
-#   "release_mbid": "",
-#   "release_name": null,
-#   "tags": [],
-#   "track_name": "Tougher Than It Is"
-#   "user_id": 5,
-# }
-# All the keys in the dict are column/field names in a Spark dataframe.
-
-
 def create_dir(path):
     """ Creates a directory in HDFS.
     Args:

@@ -57,25 +37,7 @@ def path_exists(path):
         path (string): Path to check status for.
     Note: Caller is responsible for initializing HDFS connection.
     """
-    path_found = hdfs_connection.client.status(path, strict=False)
-    if path_found:
-        return True
-    return False
-
-
-def hdfs_walk(path, depth=0):
-    """ Depth-first walk of HDFS filesystem.
-    Args:
-        path (str): Path to start DFS.
-        depth (int): Maximum depth to explore files/folders. 0 for no limit.
-    Returns:
-        walk: a generator yeilding tuples (path, dirs, files).
-    """
-    try:
-        walk = hdfs_connection.client.walk(hdfs_path=path, depth=depth)
-        return walk
-    except HdfsError as err:
-        raise PathNotFoundException(str(err), path)
+    return hdfs_connection.client.status(path, strict=False)


 def upload_to_HDFS(hdfs_path, local_path):

@@ -96,24 +58,6 @@ def rename(hdfs_src_path: str, hdfs_dst_path: str):
     hdfs_connection.client.rename(hdfs_src_path, hdfs_dst_path)


-def copy(hdfs_src_path: str, hdfs_dst_path: str, overwrite: bool = False):
-    """ Copy a file or folder in HDFS
-    Args:
-        hdfs_src_path – Source path.
-        hdfs_dst_path – Destination path. If the path already exists and is a directory, the source will be copied into it.
-        overwrite - Wether to overwrite the path if it already exists.
-    """
-    walk = hdfs_walk(hdfs_src_path)
-
-    for (root, dirs, files) in walk:
-        for _file in files:
-            src_file_path = os.path.join(root, _file)
-            dst_file_path = os.path.join(hdfs_dst_path, os.path.relpath(src_file_path, hdfs_src_path))
-            with hdfs_connection.client.read(src_file_path) as reader:
-                with hdfs_connection.client.write(dst_file_path, overwrite=overwrite) as writer:
-                    writer.write(reader.read())
-
-
 def move(hdfs_src_path: str, hdfs_dest_path: str):
     """ Move a file or folder in HDFS """
     # Delete existing destination directory if any

listenbrainz_spark/listens/cache.py

+1 -1

@@ -1,7 +1,7 @@
 import os
 from typing import Optional

-from pandas import DataFrame
+from pyspark.sql import DataFrame

 from listenbrainz_spark.listens.metadata import get_listens_metadata
 from listenbrainz_spark.utils import read_files_from_HDFS

listenbrainz_spark/listens/compact.py

+2 -2

@@ -36,7 +36,7 @@ def write_partitioned_listens(table):
     new_base_listens_location = os.path.join(new_location, "base")

     listenbrainz_spark \
-        .sql_context \
+        .session \
         .sql(query) \
         .write \
         .partitionBy("year", "month") \

@@ -48,7 +48,7 @@ def write_partitioned_listens(table):
          from parquet.`{new_base_listens_location}`
     """
     result = listenbrainz_spark \
-        .sql_context \
+        .session \
         .sql(query) \
         .collect()[0]

listenbrainz_spark/listens/data.py

+1 -1

@@ -112,7 +112,7 @@ def get_base_listens_df(location, start: datetime, end: datetime):
              , artist_credit_mbids
          from parquet.`{location}`
     """) + where_clause
-    return listenbrainz_spark.sql_context.sql(query)
+    return listenbrainz_spark.session.sql(query)


 def get_latest_listen_ts() -> datetime:

listenbrainz_spark/listens/dump.py

+2 -2

@@ -236,7 +236,7 @@ def process_incremental_listens_dump(temp_path):
          GROUP BY user_id
     """
     listenbrainz_spark \
-        .sql_context \
+        .session \
         .sql(query) \
         .repartition(1) \
         .write \

@@ -251,7 +251,7 @@ def process_incremental_listens_dump(temp_path):
          from parquet.`{inc_listens_location}`
     """
     result = listenbrainz_spark \
-        .sql_context \
+        .session \
         .sql(query) \
         .collect()[0]
     update_listens_metadata(location, result.max_listened_at, result.max_created)

listenbrainz_spark/mlhd/download.py

+1 -1

@@ -43,7 +43,7 @@ def post_process_mlhd_plus():
     """
     for chunk in MLHD_PLUS_CHUNKS:
         listenbrainz_spark\
-            .sql_context\
+            .session\
             .read\
             .format("parquet")\
             .option("pathGlobFilter", f"{chunk}*.parquet")\

listenbrainz_spark/postgres/utils.py

+1 -1

@@ -7,7 +7,7 @@

 def load_from_db(url, user, password, query):
     return listenbrainz_spark\
-        .sql_context\
+        .session\
         .read\
         .format("jdbc")\
         .option("url", url)\

listenbrainz_spark/recommendations/recording/create_dataframes.py

+1 -1

@@ -169,7 +169,7 @@ def save_playcounts_df(listens_df, recordings_df, users_df, metadata, save_path)
         .agg(func.count('recording_id').alias('playcount'))
     playcounts_df.createOrReplaceTempView("playcounts")

-    transformed_listencounts = listenbrainz_spark.sql_context.sql(f"""
+    transformed_listencounts = listenbrainz_spark.session.sql(f"""
         SELECT spark_user_id
              , recording_id
              , playcount

listenbrainz_spark/recommendations/recording/recommend.py

+1 -1

@@ -40,7 +40,7 @@ def get_most_recent_model_meta():
         model_id (str): Model identification string.
     """
     utils.read_files_from_HDFS(path.RECOMMENDATION_RECORDING_MODEL_METADATA).createOrReplaceTempView("model_metadata")
-    meta = listenbrainz_spark.sql_context.sql("""
+    meta = listenbrainz_spark.session.sql("""
         SELECT model_id, model_html_file
           FROM model_metadata
      ORDER BY model_created DESC

listenbrainz_spark/request_consumer/request_consumer.py

+9 -15

@@ -102,12 +102,15 @@ def push_to_result_queue(self, messages):
            logger.info("No messages calculated")

    def callback(self, message: Message):
-        request = json.loads(message.body)
-        logger.info('Received a request!')
-        messages = self.get_result(request)
-        if messages:
-            self.push_to_result_queue(messages)
-        logger.info('Request done!')
+        try:
+            request = json.loads(message.body)
+            logger.info('Received a request!')
+            messages = self.get_result(request)
+            if messages:
+                self.push_to_result_queue(messages)
+            logger.info('Request done!')
+        except Exception as e:
+            logger.error("Error while processing request: %s", str(e), exc_info=True)

    def get_consumers(self, _, channel):
        return [

@@ -135,12 +138,3 @@ def start(self, app_name):
            except Exception as e:
                logger.critical("Error in spark-request-consumer: %s", str(e), exc_info=True)
                time.sleep(2)
-
-
-def main(app_name):
-    rc = RequestConsumer()
-    rc.start(app_name)
-
-
-if __name__ == '__main__':
-    main('spark-writer')
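
Design note on the hunk above: the new try/except in callback() (point 6 of the summary) makes a failure in a single stats or recommendation job get logged instead of propagating into kombu's consume loop and killing the consumer; the outer loop in start(), visible as context in the second hunk, still catches fatal errors, logs them and retries after a short sleep. The module-level main() entry point is dropped because the request consumer is now started via spark_manage.py, as shown in the start-spark-request-consumer.sh diff.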

listenbrainz_spark/similarity/artist.py

+3 -4

@@ -2,11 +2,10 @@

 from more_itertools import chunked

-import listenbrainz_spark
-from listenbrainz_spark import config
 from listenbrainz_spark.path import RECORDING_LENGTH_DATAFRAME, ARTIST_CREDIT_MBID_DATAFRAME
 from listenbrainz_spark.stats import run_query
 from listenbrainz_spark.listens.data import get_listens_from_dump
+from listenbrainz_spark.utils import read_files_from_HDFS

 RECORDINGS_PER_MESSAGE = 10000
 # the duration value in seconds to use for track whose duration data in not available in MB

@@ -128,10 +127,10 @@ def main(days, session, contribution, threshold, limit, skip, is_production_data

     get_listens_from_dump(from_date, to_date).createOrReplaceTempView(table)

-    metadata_df = listenbrainz_spark.sql_context.read.parquet(config.HDFS_CLUSTER_URI + RECORDING_LENGTH_DATAFRAME)
+    metadata_df = read_files_from_HDFS(RECORDING_LENGTH_DATAFRAME)
     metadata_df.createOrReplaceTempView(metadata_table)

-    artist_credit_df = listenbrainz_spark.sql_context.read.parquet(config.HDFS_CLUSTER_URI + ARTIST_CREDIT_MBID_DATAFRAME)
+    artist_credit_df = read_files_from_HDFS(ARTIST_CREDIT_MBID_DATAFRAME)
     artist_credit_df.createOrReplaceTempView(artist_credit_table)

     skip_threshold = -skip

0 commit comments