From 6533960a5a00a10b00598c9394061219138e519d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 24 Mar 2025 20:48:07 +0000 Subject: [PATCH 1/2] [pre-commit.ci] pre-commit autoupdate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.8.1 → v0.11.2](https://github.com/astral-sh/ruff-pre-commit/compare/v0.8.1...v0.11.2) --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ec4e222bbc..db89731a85 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -11,7 +11,7 @@ repos: - --fuzzy-match-generates-todo - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.8.1 + rev: v0.11.2 hooks: - id: ruff args: [--fix] From b580539f54f82d2f01ad8c74cb572cb5462eee1f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 24 Mar 2025 20:48:35 +0000 Subject: [PATCH 2/2] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/distilabel/distiset.py | 6 +-- src/distilabel/pipeline/batch_manager.py | 11 +++-- src/distilabel/pipeline/write_buffer.py | 6 +-- src/distilabel/steps/base.py | 2 +- src/distilabel/steps/tasks/base.py | 2 +- .../steps/tasks/improving_text_embeddings.py | 4 +- .../steps/tasks/math_shepherd/completer.py | 2 +- src/distilabel/steps/tasks/text_generation.py | 2 +- .../utils/mkdocs/components_gallery.py | 12 +++--- tests/unit/models/embeddings/test_llamacpp.py | 42 +++++++++---------- 10 files changed, 43 insertions(+), 46 deletions(-) diff --git a/src/distilabel/distiset.py b/src/distilabel/distiset.py index f44d20a3ab..ca4df6eab1 100644 --- a/src/distilabel/distiset.py +++ b/src/distilabel/distiset.py @@ -509,9 +509,9 @@ def load_from_disk( ) dest_distiset_path = distiset_path - assert fs.isdir( - original_distiset_path - ), "`distiset_path` must be a `PathLike` object pointing to a folder or a URI of a remote filesystem." + assert fs.isdir(original_distiset_path), ( + "`distiset_path` must be a `PathLike` object pointing to a folder or a URI of a remote filesystem." + ) has_config = False has_artifacts = False diff --git a/src/distilabel/pipeline/batch_manager.py b/src/distilabel/pipeline/batch_manager.py index c150d09916..c18719abb6 100644 --- a/src/distilabel/pipeline/batch_manager.py +++ b/src/distilabel/pipeline/batch_manager.py @@ -231,10 +231,10 @@ def from_step( input_batch_size=getattr(step, "input_batch_size", None), data={predecessor: [] for predecessor in predecessors}, convergence_step=convergence_step, - next_expected_seq_no={predecessor: (0, 0) for predecessor in predecessors}, + next_expected_seq_no=dict.fromkeys(predecessors, (0, 0)), step_signature=step.signature, use_cache=step.use_cache, - step_offset={predecessor: (0, 0) for predecessor in predecessors}, + step_offset=dict.fromkeys(predecessors, (0, 0)), ) def _get_seq_no(self) -> int: @@ -314,10 +314,9 @@ def _get_data_for_convergence_step( seq_no, batches = grouped_batches[0] str_seq_no = str(seq_no) - remaining_rows_per_step: Dict[str, int] = { - step_name: self.input_batch_size - for step_name in self.data # type: ignore - } + remaining_rows_per_step: Dict[str, int] = dict.fromkeys( + self.data, self.input_batch_size + ) batches_used = defaultdict(list) data = defaultdict(list) for batch, batch_size in batches: diff --git a/src/distilabel/pipeline/write_buffer.py b/src/distilabel/pipeline/write_buffer.py index 3fdb037e14..0ec3799dcc 100644 --- a/src/distilabel/pipeline/write_buffer.py +++ b/src/distilabel/pipeline/write_buffer.py @@ -65,11 +65,9 @@ def __init__( step: [] for step in leaf_steps } # TODO: make this configurable - self._buffers_dump_batch_size: Dict[str, int] = { - step: 50 for step in leaf_steps - } + self._buffers_dump_batch_size: Dict[str, int] = dict.fromkeys(leaf_steps, 50) self._buffer_last_schema = {} - self._buffers_last_file: Dict[str, int] = {step: 1 for step in leaf_steps} + self._buffers_last_file: Dict[str, int] = dict.fromkeys(leaf_steps, 1) self._steps_cached = steps_cached or {} self._logger = logging.getLogger("distilabel.write_buffer") diff --git a/src/distilabel/steps/base.py b/src/distilabel/steps/base.py index 88bed374bc..d7628feb62 100644 --- a/src/distilabel/steps/base.py +++ b/src/distilabel/steps/base.py @@ -101,7 +101,7 @@ def _infer_step_name( base_name = "_".join(parts[:-1]) while name in step_names: idx = int(name.split("_")[-1]) - name = f"{base_name}_{idx+1}" + name = f"{base_name}_{idx + 1}" return name diff --git a/src/distilabel/steps/tasks/base.py b/src/distilabel/steps/tasks/base.py index 3a575545d1..2ae43cbc38 100644 --- a/src/distilabel/steps/tasks/base.py +++ b/src/distilabel/steps/tasks/base.py @@ -211,7 +211,7 @@ def _output_on_failure( a new field `distilabel_meta` with the raw output of the LLM. """ # Create a dictionary with the outputs of the task (every output set to None) - outputs = {output: None for output in self.outputs} + outputs = dict.fromkeys(self.outputs) outputs["model_name"] = self.llm.model_name # type: ignore outputs = self._create_metadata( outputs, diff --git a/src/distilabel/steps/tasks/improving_text_embeddings.py b/src/distilabel/steps/tasks/improving_text_embeddings.py index 8569c12810..eeb0897c1e 100644 --- a/src/distilabel/steps/tasks/improving_text_embeddings.py +++ b/src/distilabel/steps/tasks/improving_text_embeddings.py @@ -66,7 +66,7 @@ def format_output( A Python dictionary with the parsed output based on the `keys` property. """ if output is None: - return {key: None for key in self.keys} + return dict.fromkeys(self.keys) def escape_backslashes_in_values(s): # Regular expression to match the key-value pairs in the dictionary @@ -100,7 +100,7 @@ def replace_backslashes(match): pass if not isinstance(output, dict): - return {key: None for key in self.keys} + return dict.fromkeys(self.keys) return {key: output.get(key, None) for key in self.keys} diff --git a/src/distilabel/steps/tasks/math_shepherd/completer.py b/src/distilabel/steps/tasks/math_shepherd/completer.py index 05ff410ac5..e43f2424c8 100644 --- a/src/distilabel/steps/tasks/math_shepherd/completer.py +++ b/src/distilabel/steps/tasks/math_shepherd/completer.py @@ -485,7 +485,7 @@ def _auto_label( self._logger.info("Completer failed due to empty completion") continue if completion[-1] == golden_answers[instruction_i]: - label = f" { self.tags[0]}" + label = f" {self.tags[0]}" # If we found one, it's enough as we are doing Hard Estimation continue # In case we had no solutions from the previous step, otherwise we would have diff --git a/src/distilabel/steps/tasks/text_generation.py b/src/distilabel/steps/tasks/text_generation.py index 59cf932423..905a49d8ee 100644 --- a/src/distilabel/steps/tasks/text_generation.py +++ b/src/distilabel/steps/tasks/text_generation.py @@ -229,7 +229,7 @@ def unload(self) -> None: @property def inputs(self) -> "StepColumns": """The input for the task is the `instruction` by default, or the `columns` given as input.""" - columns = {column: True for column in self.columns} + columns = dict.fromkeys(self.columns, True) columns["system_prompt"] = False return columns diff --git a/src/distilabel/utils/mkdocs/components_gallery.py b/src/distilabel/utils/mkdocs/components_gallery.py index 77225b9baa..d663ec4ce6 100644 --- a/src/distilabel/utils/mkdocs/components_gallery.py +++ b/src/distilabel/utils/mkdocs/components_gallery.py @@ -296,9 +296,9 @@ def _generate_steps_pages(self, src_dir: Path, steps: list) -> List[str]: docstring["icon"] = _STEPS_CATEGORY_TO_ICON.get(first_category, "") if docstring["icon"]: - assert ( - docstring["icon"] in _STEPS_CATEGORY_TO_ICON.values() - ), f"Icon {docstring['icon']} not found in _STEPS_CATEGORY_TO_ICON" + assert docstring["icon"] in _STEPS_CATEGORY_TO_ICON.values(), ( + f"Icon {docstring['icon']} not found in _STEPS_CATEGORY_TO_ICON" + ) name = step["name"] @@ -364,9 +364,9 @@ def _generate_tasks_pages(self, src_dir: Path, tasks: list) -> List[str]: first_category = docstring["categories"][0] docstring["icon"] = _STEPS_CATEGORY_TO_ICON.get(first_category, "") if docstring["icon"]: - assert ( - docstring["icon"] in _STEPS_CATEGORY_TO_ICON.values() - ), f"Icon {docstring['icon']} not found in _STEPS_CATEGORY_TO_ICON" + assert docstring["icon"] in _STEPS_CATEGORY_TO_ICON.values(), ( + f"Icon {docstring['icon']} not found in _STEPS_CATEGORY_TO_ICON" + ) name = task["name"] diff --git a/tests/unit/models/embeddings/test_llamacpp.py b/tests/unit/models/embeddings/test_llamacpp.py index b219ac7798..99a7c77902 100644 --- a/tests/unit/models/embeddings/test_llamacpp.py +++ b/tests/unit/models/embeddings/test_llamacpp.py @@ -115,9 +115,9 @@ def test_normalize_embeddings(self, test_inputs): for result in results: # Check if the embedding is normalized (L2 norm should be close to 1) norm = np.linalg.norm(result) - assert np.isclose( - norm, 1.0, atol=1e-6 - ), f"Norm is {norm}, expected close to 1.0" + assert np.isclose(norm, 1.0, atol=1e-6), ( + f"Norm is {norm}, expected close to 1.0" + ) def test_normalize_embeddings_false(self, test_inputs): """ @@ -129,15 +129,15 @@ def test_normalize_embeddings_false(self, test_inputs): for result in results: # Check if the embedding is not normalized (L2 norm should not be close to 1) norm = np.linalg.norm(result) - assert not np.isclose( - norm, 1.0, atol=1e-6 - ), f"Norm is {norm}, expected not close to 1.0" + assert not np.isclose(norm, 1.0, atol=1e-6), ( + f"Norm is {norm}, expected not close to 1.0" + ) # Additional check: ensure that at least one embedding has a norm significantly different from 1 norms = [np.linalg.norm(result) for result in results] - assert any( - not np.isclose(norm, 1.0, atol=0.1) for norm in norms - ), "Expected at least one embedding with norm not close to 1.0" + assert any(not np.isclose(norm, 1.0, atol=0.1) for norm in norms), ( + "Expected at least one embedding with norm not close to 1.0" + ) def test_encode_batch(self) -> None: """ @@ -149,20 +149,20 @@ def test_encode_batch(self) -> None: inputs = [f"This is test sentence {i}" for i in range(batch_size)] results = self.embeddings.encode(inputs=inputs) - assert ( - len(results) == batch_size - ), f"Expected {batch_size} results, got {len(results)}" + assert len(results) == batch_size, ( + f"Expected {batch_size} results, got {len(results)}" + ) for result in results: - assert ( - len(result) == 384 - ), f"Expected embedding dimension 384, got {len(result)}" + assert len(result) == 384, ( + f"Expected embedding dimension 384, got {len(result)}" + ) # Test with a large batch to ensure it doesn't cause issues large_batch = ["Large batch test" for _ in range(100)] large_results = self.embeddings.encode(inputs=large_batch) - assert ( - len(large_results) == 100 - ), f"Expected 100 results for large batch, got {len(large_results)}" + assert len(large_results) == 100, ( + f"Expected 100 results for large batch, got {len(large_results)}" + ) def test_encode_batch_consistency(self) -> None: """ @@ -180,6 +180,6 @@ def test_encode_batch_consistency(self) -> None: batch_result = self.embeddings.encode([input_text, "Another sentence"])[0] # Compare the embeddings - assert np.allclose( - single_result, batch_result, atol=1e-5 - ), "Embeddings are not consistent between single and batch processing" + assert np.allclose(single_result, batch_result, atol=1e-5), ( + "Embeddings are not consistent between single and batch processing" + )