 # These values are defined to create spark parquet files that are at most 128MB in size.
 # This compression ratio allows us to roughly estimate how full we can make files before starting a new one.
-PARQUET_APPROX_COMPRESSION_RATIO = .57
+PARQUET_APPROX_COMPRESSION_RATIO = 0.25

 # This is the approximate amount of data to write to a parquet file in order to meet the max size
 PARQUET_TARGET_SIZE = 134217728 / PARQUET_APPROX_COMPRESSION_RATIO  # 128MB / compression ratio
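The estimated ratio drops from .57 to 0.25, presumably because the zstd codec introduced further down compresses better than the previous default; a lower on-disk/raw ratio means more raw data can be buffered before a file is expected to hit the 128MB budget. A minimal sketch of the arithmetic and of a rollover check built on it (`should_start_new_file` is a hypothetical helper, not part of this patch):

```python
# Sketch only: how PARQUET_TARGET_SIZE is derived and how it could be used
# to decide when to roll over to a new parquet file.
PARQUET_APPROX_COMPRESSION_RATIO = 0.25  # estimated on-disk size / raw data size
PARQUET_TARGET_SIZE = 134217728 / PARQUET_APPROX_COMPRESSION_RATIO  # 128MB / 0.25 = 512MB of raw data

def should_start_new_file(raw_bytes_buffered: int) -> bool:
    """Hypothetical helper: start a new parquet file once enough raw data has
    been buffered that the compressed output is expected to reach 128MB."""
    return raw_bytes_buffered >= PARQUET_TARGET_SIZE
```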
     pa.field("release_mbid", pa.string(), True),
     pa.field("recording_name", pa.string(), False),
     pa.field("recording_mbid", pa.string(), True),
-    pa.field('artist_credit_mbids', pa.list_(pa.string()), True),
+    pa.field("artist_credit_mbids", pa.list_(pa.string()), True),
 ])
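These fields are part of the pyarrow schema (referenced as SPARK_LISTENS_SCHEMA in the next hunk) that the dump writer applies to each file. A minimal, self-contained sketch showing only the fields visible in this hunk, built into a table; the sample values are made up:

```python
import pyarrow as pa

# Only the fields visible in this hunk; the real SPARK_LISTENS_SCHEMA has more
# columns. The third positional argument to pa.field() is `nullable`.
schema = pa.schema([
    pa.field("release_mbid", pa.string(), True),
    pa.field("recording_name", pa.string(), False),
    pa.field("recording_mbid", pa.string(), True),
    pa.field("artist_credit_mbids", pa.list_(pa.string()), True),
])

# Hypothetical row demonstrating the column shapes: nullable strings, a
# required recording name, and a list-of-strings column for artist MBIDs.
table = pa.table({
    "release_mbid": [None],
    "recording_name": ["Example Track"],
    "recording_mbid": [None],
    "artist_credit_mbids": [["00000000-0000-0000-0000-000000000001"]],
}, schema=schema)
```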
@@ -462,7 +462,7 @@ def write_parquet_files(self,
         # Create a pandas dataframe, then write that to a parquet file
         df = pd.DataFrame(data, dtype=object)
         table = pa.Table.from_pandas(df, schema=SPARK_LISTENS_SCHEMA, preserve_index=False)
-        pq.write_table(table, filename, flavor="spark")
+        pq.write_table(table, filename, flavor="spark", compression="zstd")
         file_size = os.path.getsize(filename)
         tar_file.add(filename, arcname=os.path.join(archive_dir, "%d.parquet" % parquet_file_id))
         os.unlink(filename)
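For a runnable picture of the write path above, here is a self-contained sketch under assumed inputs; the sample data, reduced schema, and filename are hypothetical, but the write_table() call mirrors the patched line:

```python
import os
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

# Hypothetical listen rows; the real code assembles `data` from the dump.
data = [{
    "release_mbid": None,
    "recording_name": "Example Track",
    "recording_mbid": None,
    "artist_credit_mbids": ["00000000-0000-0000-0000-000000000001"],
}]
schema = pa.schema([
    pa.field("release_mbid", pa.string(), True),
    pa.field("recording_name", pa.string(), False),
    pa.field("recording_mbid", pa.string(), True),
    pa.field("artist_credit_mbids", pa.list_(pa.string()), True),
])

filename = "0.parquet"
df = pd.DataFrame(data, dtype=object)
table = pa.Table.from_pandas(df, schema=schema, preserve_index=False)

# flavor="spark" sanitizes the schema for Spark compatibility; compression="zstd"
# swaps the codec from the pyarrow default (snappy) to Zstandard.
pq.write_table(table, filename, flavor="spark", compression="zstd")
print(os.path.getsize(filename), "bytes on disk")
os.unlink(filename)
```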