
Commit 240c694

Use zstd compression for parquet dumps
Spark/parquet dumps have been growing a lot in size lately. After trying multiple unsuccessful ways to reduce their size while keeping the existing snappy compression, switch to zstd compression, which seems to be almost twice as efficient on the test set.
1 parent c5fe341 commit 240c694
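
The effect of the codec switch can be sanity-checked outside the dump pipeline. The snippet below is a minimal sketch, not the dump code itself: it builds a small throwaway pyarrow table (the column names and /tmp paths are made up for illustration; the real dumps use SPARK_LISTENS_SCHEMA) and writes it once with the old snappy codec and once with zstd so the on-disk sizes can be compared. Actual savings depend on the data; the "almost twice as efficient" figure refers to the listens test set mentioned above.

import os

import pyarrow as pa
import pyarrow.parquet as pq

# Throwaway sample table, purely for comparing codecs.
table = pa.table({
    "recording_name": ["recording %d" % i for i in range(100000)],
    "listened_at": list(range(100000)),
})

for codec in ("snappy", "zstd"):
    path = "/tmp/listens_%s.parquet" % codec
    pq.write_table(table, path, flavor="spark", compression=codec)
    print("%s: %d bytes" % (codec, os.path.getsize(path)))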

File tree

1 file changed, +3 -3 lines changed


listenbrainz/listenstore/dump_listenstore.py

+3 -3
@@ -27,7 +27,7 @@
 
 # These values are defined to create spark parquet files that are at most 128MB in size.
 # This compression ration allows us to roughly estimate how full we can make files before starting a new one
-PARQUET_APPROX_COMPRESSION_RATIO = .57
+PARQUET_APPROX_COMPRESSION_RATIO = 0.25
 
 # This is the approximate amount of data to write to a parquet file in order to meet the max size
 PARQUET_TARGET_SIZE = 134217728 / PARQUET_APPROX_COMPRESSION_RATIO # 128MB / compression ratio
@@ -44,7 +44,7 @@
 pa.field("release_mbid", pa.string(), True),
 pa.field("recording_name", pa.string(), False),
 pa.field("recording_mbid", pa.string(), True),
-pa.field('artist_credit_mbids', pa.list_(pa.string()), True),
+pa.field("artist_credit_mbids", pa.list_(pa.string()), True),
 ])
 
 
@@ -462,7 +462,7 @@ def write_parquet_files(self,
 # Create a pandas dataframe, then write that to a parquet files
 df = pd.DataFrame(data, dtype=object)
 table = pa.Table.from_pandas(df, schema=SPARK_LISTENS_SCHEMA, preserve_index=False)
-pq.write_table(table, filename, flavor="spark")
+pq.write_table(table, filename, flavor="spark", compression="zstd")
 file_size = os.path.getsize(filename)
 tar_file.add(filename, arcname=os.path.join(archive_dir, "%d.parquet" % parquet_file_id))
 os.unlink(filename)
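
A quick back-of-the-envelope shows why the ratio constant moves together with the codec. Assuming, as the surrounding comments suggest, that PARQUET_APPROX_COMPRESSION_RATIO approximates on-disk size divided by in-memory size, the amount of raw data buffered per 128MB parquet file roughly doubles (the variable names below are illustrative, only the constants come from the diff):

MAX_FILE_SIZE = 134217728  # 128MB target for each parquet file on disk

for codec, ratio in (("snappy (old)", 0.57), ("zstd (new)", 0.25)):
    # approximate uncompressed bytes to accumulate before starting a new file
    target = MAX_FILE_SIZE / ratio
    print("%s: write ~%dMB of raw data per file" % (codec, target // (1024 * 1024)))

# snappy (old): write ~224MB of raw data per file
# zstd (new): write ~512MB of raw data per file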
