Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[pre-commit.ci] pre-commit autoupdate #1128

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ repos:
  - --fuzzy-match-generates-todo

  - repo: https://github.com/astral-sh/ruff-pre-commit
- rev: v0.8.1
+ rev: v0.11.2
  hooks:
  - id: ruff
  args: [--fix]
Expand Down
6 changes: 3 additions & 3 deletions src/distilabel/distiset.py
Original file line number Diff line number Diff line change
Expand Up @@ -509,9 +509,9 @@ def load_from_disk(
)
dest_distiset_path = distiset_path

- assert fs.isdir(
- original_distiset_path
- ), "`distiset_path` must be a `PathLike` object pointing to a folder or a URI of a remote filesystem."
+ assert fs.isdir(original_distiset_path), (
+ "`distiset_path` must be a `PathLike` object pointing to a folder or a URI of a remote filesystem."
+ )

has_config = False
has_artifacts = False
Expand Down
11 changes: 5 additions & 6 deletions src/distilabel/pipeline/batch_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,10 +231,10 @@ def from_step(
input_batch_size=getattr(step, "input_batch_size", None),
data={predecessor: [] for predecessor in predecessors},
convergence_step=convergence_step,
- next_expected_seq_no={predecessor: (0, 0) for predecessor in predecessors},
+ next_expected_seq_no=dict.fromkeys(predecessors, (0, 0)),
step_signature=step.signature,
use_cache=step.use_cache,
- step_offset={predecessor: (0, 0) for predecessor in predecessors},
+ step_offset=dict.fromkeys(predecessors, (0, 0)),
)

def _get_seq_no(self) -> int:
Expand Down Expand Up @@ -314,10 +314,9 @@ def _get_data_for_convergence_step(
seq_no, batches = grouped_batches[0]
str_seq_no = str(seq_no)

- remaining_rows_per_step: Dict[str, int] = {
- step_name: self.input_batch_size
- for step_name in self.data # type: ignore
- }
+ remaining_rows_per_step: Dict[str, int] = dict.fromkeys(
+ self.data, self.input_batch_size
+ )
batches_used = defaultdict(list)
data = defaultdict(list)
for batch, batch_size in batches:
Expand Down
6 changes: 2 additions & 4 deletions src/distilabel/pipeline/write_buffer.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,11 +65,9 @@ def __init__(
step: [] for step in leaf_steps
}
# TODO: make this configurable
- self._buffers_dump_batch_size: Dict[str, int] = {
- step: 50 for step in leaf_steps
- }
+ self._buffers_dump_batch_size: Dict[str, int] = dict.fromkeys(leaf_steps, 50)
self._buffer_last_schema = {}
- self._buffers_last_file: Dict[str, int] = {step: 1 for step in leaf_steps}
+ self._buffers_last_file: Dict[str, int] = dict.fromkeys(leaf_steps, 1)
self._steps_cached = steps_cached or {}
self._logger = logging.getLogger("distilabel.write_buffer")

Expand Down
2 changes: 1 addition & 1 deletion src/distilabel/steps/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ def _infer_step_name(
base_name = "_".join(parts[:-1])
while name in step_names:
idx = int(name.split("_")[-1])
- name = f"{base_name}_{idx+1}"
+ name = f"{base_name}_{idx + 1}"
return name


Expand Down
2 changes: 1 addition & 1 deletion src/distilabel/steps/tasks/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,7 @@ def _output_on_failure(
a new field `distilabel_meta` with the raw output of the LLM.
"""
# Create a dictionary with the outputs of the task (every output set to None)
- outputs = {output: None for output in self.outputs}
+ outputs = dict.fromkeys(self.outputs)
outputs["model_name"] = self.llm.model_name # type: ignore
outputs = self._create_metadata(
outputs,
Expand Down
4 changes: 2 additions & 2 deletions src/distilabel/steps/tasks/improving_text_embeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def format_output(
A Python dictionary with the parsed output based on the `keys` property.
"""
if output is None:
- return {key: None for key in self.keys}
+ return dict.fromkeys(self.keys)

def escape_backslashes_in_values(s):
# Regular expression to match the key-value pairs in the dictionary
Expand Down Expand Up @@ -100,7 +100,7 @@ def replace_backslashes(match):
pass

if not isinstance(output, dict):
- return {key: None for key in self.keys}
+ return dict.fromkeys(self.keys)

return {key: output.get(key, None) for key in self.keys}

Expand Down
2 changes: 1 addition & 1 deletion src/distilabel/steps/tasks/math_shepherd/completer.py
Original file line number Diff line number Diff line change
Expand Up @@ -485,7 +485,7 @@ def _auto_label(
self._logger.info("Completer failed due to empty completion")
continue
if completion[-1] == golden_answers[instruction_i]:
- label = f" { self.tags[0]}"
+ label = f" {self.tags[0]}"
# If we found one, it's enough as we are doing Hard Estimation
continue
# In case we had no solutions from the previous step, otherwise we would have
Expand Down
2 changes: 1 addition & 1 deletion src/distilabel/steps/tasks/text_generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,7 @@ def unload(self) -> None:
@property
def inputs(self) -> "StepColumns":
"""The input for the task is the `instruction` by default, or the `columns` given as input."""
- columns = {column: True for column in self.columns}
+ columns = dict.fromkeys(self.columns, True)
columns["system_prompt"] = False
return columns

Expand Down
12 changes: 6 additions & 6 deletions src/distilabel/utils/mkdocs/components_gallery.py
Original file line number Diff line number Diff line change
Expand Up @@ -296,9 +296,9 @@ def _generate_steps_pages(self, src_dir: Path, steps: list) -> List[str]:
docstring["icon"] = _STEPS_CATEGORY_TO_ICON.get(first_category, "")

if docstring["icon"]:
- assert (
- docstring["icon"] in _STEPS_CATEGORY_TO_ICON.values()
- ), f"Icon {docstring['icon']} not found in _STEPS_CATEGORY_TO_ICON"
+ assert docstring["icon"] in _STEPS_CATEGORY_TO_ICON.values(), (
+ f"Icon {docstring['icon']} not found in _STEPS_CATEGORY_TO_ICON"
+ )

name = step["name"]

Expand Down Expand Up @@ -364,9 +364,9 @@ def _generate_tasks_pages(self, src_dir: Path, tasks: list) -> List[str]:
first_category = docstring["categories"][0]
docstring["icon"] = _STEPS_CATEGORY_TO_ICON.get(first_category, "")
if docstring["icon"]:
- assert (
- docstring["icon"] in _STEPS_CATEGORY_TO_ICON.values()
- ), f"Icon {docstring['icon']} not found in _STEPS_CATEGORY_TO_ICON"
+ assert docstring["icon"] in _STEPS_CATEGORY_TO_ICON.values(), (
+ f"Icon {docstring['icon']} not found in _STEPS_CATEGORY_TO_ICON"
+ )

name = task["name"]

Expand Down
42 changes: 21 additions & 21 deletions tests/unit/models/embeddings/test_llamacpp.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,9 +115,9 @@ def test_normalize_embeddings(self, test_inputs):
for result in results:
# Check if the embedding is normalized (L2 norm should be close to 1)
norm = np.linalg.norm(result)
- assert np.isclose(
- norm, 1.0, atol=1e-6
- ), f"Norm is {norm}, expected close to 1.0"
+ assert np.isclose(norm, 1.0, atol=1e-6), (
+ f"Norm is {norm}, expected close to 1.0"
+ )

def test_normalize_embeddings_false(self, test_inputs):
"""
Expand All @@ -129,15 +129,15 @@ def test_normalize_embeddings_false(self, test_inputs):
for result in results:
# Check if the embedding is not normalized (L2 norm should not be close to 1)
norm = np.linalg.norm(result)
- assert not np.isclose(
- norm, 1.0, atol=1e-6
- ), f"Norm is {norm}, expected not close to 1.0"
+ assert not np.isclose(norm, 1.0, atol=1e-6), (
+ f"Norm is {norm}, expected not close to 1.0"
+ )

# Additional check: ensure that at least one embedding has a norm significantly different from 1
norms = [np.linalg.norm(result) for result in results]
- assert any(
- not np.isclose(norm, 1.0, atol=0.1) for norm in norms
- ), "Expected at least one embedding with norm not close to 1.0"
+ assert any(not np.isclose(norm, 1.0, atol=0.1) for norm in norms), (
+ "Expected at least one embedding with norm not close to 1.0"
+ )

def test_encode_batch(self) -> None:
"""
Expand All @@ -149,20 +149,20 @@ def test_encode_batch(self) -> None:
inputs = [f"This is test sentence {i}" for i in range(batch_size)]
results = self.embeddings.encode(inputs=inputs)

- assert (
- len(results) == batch_size
- ), f"Expected {batch_size} results, got {len(results)}"
+ assert len(results) == batch_size, (
+ f"Expected {batch_size} results, got {len(results)}"
+ )
for result in results:
- assert (
- len(result) == 384
- ), f"Expected embedding dimension 384, got {len(result)}"
+ assert len(result) == 384, (
+ f"Expected embedding dimension 384, got {len(result)}"
+ )

# Test with a large batch to ensure it doesn't cause issues
large_batch = ["Large batch test" for _ in range(100)]
large_results = self.embeddings.encode(inputs=large_batch)
- assert (
- len(large_results) == 100
- ), f"Expected 100 results for large batch, got {len(large_results)}"
+ assert len(large_results) == 100, (
+ f"Expected 100 results for large batch, got {len(large_results)}"
+ )

def test_encode_batch_consistency(self) -> None:
"""
Expand All @@ -180,6 +180,6 @@ def test_encode_batch_consistency(self) -> None:
batch_result = self.embeddings.encode([input_text, "Another sentence"])[0]

# Compare the embeddings
- assert np.allclose(
- single_result, batch_result, atol=1e-5
- ), "Embeddings are not consistent between single and batch processing"
+ assert np.allclose(single_result, batch_result, atol=1e-5), (
+ "Embeddings are not consistent between single and batch processing"
+ )
Loading