simplify testing infrastructure (especially the AVAILABLE_MODELS dictionary) #1121

Merged: 17 commits, Feb 13, 2025
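For orientation, a minimal self-contained sketch of the refactor's shape (hypothetical helper names; the real code is in tests/conftest.py below): the module-level dictionary of zero-argument factories is replaced by direct dispatch on the model name inside the fixture, so heavy optional imports happen only in the branch that needs them.

# Sketch only, not part of the PR; load_via_registry and load_directly are illustrative names.

# Before: a registry of factories, resolved when the fixture runs.
AVAILABLE_MODELS = {
    "transformers_gpt2_cpu": lambda: dict(name="transformers:gpt2", kwargs=dict()),
}

def load_via_registry(selected_model_name: str) -> dict:
    return AVAILABLE_MODELS[selected_model_name]()

# After: the fixture constructs the model directly from its name.
def load_directly(selected_model_name: str):
    if selected_model_name == "transformers_gpt2_cpu":
        from guidance import models  # imported lazily, as in the new fixture
        return models.Transformers("gpt2")
    raise ValueError(f"No support for selected_model_name {selected_model_name}")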
tests/conftest.py: 310 changes (140 additions, 170 deletions)
@@ -1,138 +1,16 @@
import os
import pathlib
import random
import tempfile
import time
import uuid
import pytest
import requests
import importlib

from guidance import models

# The naming convention for the keys is "<loader>_<model>_<host>" where:
# - 'loader' is 'transformers' or 'llamacpp'
# - 'model' contains relevant information about the model itself
# - 'host' is 'cpu' or 'gpu' as appropriate

AVAILABLE_MODELS = {}

# GEMMA 2
AVAILABLE_MODELS["llamacpp_gemma2_9b_cpu"] = lambda: dict(
# Note that this model requires an appropriate
# HF_TOKEN environment variable
name="huggingface_hubllama:bartowski/gemma-2-9b-it-GGUF:gemma-2-9b-it-IQ2_XS.gguf",
kwargs={"verbose": True, "n_ctx": 4096},
)
AVAILABLE_MODELS["transformers_gemma2_9b_cpu"] = lambda: dict(
# Note that this model requires an appropriate
# HF_TOKEN environment variable
name="transformers:google/gemma-2-9b-it",
kwargs={
"quantization_config": importlib.import_module("transformers").BitsAndBytesConfig(load_in_8bit=True),
},
)
AVAILABLE_MODELS["transformers_gemma2_9b_gpu"] = lambda: dict(
# Note that this model requires an appropriate
# HF_TOKEN environment variable
name="transformers:google/gemma-2-9b-it",
kwargs={
"device_map": "cuda:0",
"quantization_config": importlib.import_module("transformers").BitsAndBytesConfig(load_in_4bit=True),
},
)

# GPT 2
AVAILABLE_MODELS["transformers_gpt2_cpu"] = lambda: dict(name="transformers:gpt2", kwargs=dict())
AVAILABLE_MODELS["transformers_gpt2_gpu"] = lambda: dict(name="transformers:gpt2", kwargs={"device_map": "cuda:0"})

# LLAMA 2
AVAILABLE_MODELS["llamacpp_llama2_7b_cpu"] = lambda: dict(
name="huggingface_hubllama:TheBloke/Llama-2-7B-GGUF:llama-2-7b.Q5_K_M.gguf",
kwargs={"verbose": True, "n_ctx": 4096},
)
AVAILABLE_MODELS["llamacpp_llama2_7b_gpu"] = lambda: dict(
name="huggingface_hubllama:TheBloke/Llama-2-7B-GGUF:llama-2-7b.Q5_K_M.gguf",
kwargs={"verbose": True, "n_gpu_layers": -1, "n_ctx": 4096},
)

# LLAMA 3
AVAILABLE_MODELS["transformers_llama3_8b_cpu"] = lambda: dict(
# Note that this model requires an appropriate
# HF_TOKEN environment variable
name="transformers:meta-llama/Meta-Llama-3-8B-Instruct",
kwargs={"trust_remote_code": True, "torch_dtype": importlib.import_module("torch").bfloat16},
)
AVAILABLE_MODELS["transformers_llama3_8b_gpu"] = lambda: dict(
# Note that this model requires an appropriate
# HF_TOKEN environment variable
name="transformers:meta-llama/Meta-Llama-3-8B-Instruct",
kwargs={"trust_remote_code": True, "torch_dtype": importlib.import_module("torch").bfloat16, "device_map": "cuda:0"},
)

# MISTRAL
AVAILABLE_MODELS["transformers_mistral_7b_cpu"] = lambda: dict(
name="transformers:mistralai/Mistral-7B-v0.1", kwargs=dict()
)
AVAILABLE_MODELS["llamacpp_mistral_7b_cpu"] = lambda: dict(
name="huggingface_hubllama:TheBloke/Mistral-7B-Instruct-v0.2-GGUF:mistral-7b-instruct-v0.2.Q8_0.gguf",
kwargs={"verbose": True, "n_ctx": 2048},
)

# PHI 2
AVAILABLE_MODELS["transformers_phi2_cpu"] = lambda: dict(
name="transformers:microsoft/phi-2", kwargs={"trust_remote_code": True}
)
AVAILABLE_MODELS["transformers_phi2_gpu"] = lambda: dict(
name="transformers:microsoft/phi-2",
kwargs={"trust_remote_code": True, "device_map": "cuda:0"},
)

# PHI 3
AVAILABLE_MODELS["transformers_phi3_mini_4k_instruct_cpu"] = lambda: dict(
name="transformers:microsoft/Phi-3-mini-4k-instruct",
kwargs={"trust_remote_code": True},
)
AVAILABLE_MODELS["llamacpp_phi3_mini_4k_instruct_cpu"] = lambda: dict(
name="huggingface_hubllama:microsoft/Phi-3-mini-4k-instruct-gguf:Phi-3-mini-4k-instruct-q4.gguf",
kwargs={"verbose": True, "n_ctx": 4096},
)
AVAILABLE_MODELS["transformers_phi3_small_8k_instruct_gpu"] = lambda: dict(
name="transformers:microsoft/Phi-3-small-8k-instruct",
kwargs={"trust_remote_code": True, "load_in_8bit": True, "device_map": "cuda:0"},
)

# QWEN2DOT5
AVAILABLE_MODELS["transformers_qwen2dot5_0dot5b_cpu"] = lambda: dict(
name="transformers:Qwen/Qwen2.5-0.5B", kwargs=dict()
)
AVAILABLE_MODELS["transformers_qwen2dot5_0dot5b_gpu"] = lambda: dict(
name="transformers:Qwen/Qwen2.5-0.5B", kwargs={"device_map": "cuda:0"}
)
AVAILABLE_MODELS["transformers_qwen2dot5_0dot5b_instruct_cpu"] = lambda: dict(
name="transformers:Qwen/Qwen2.5-0.5B-Instruct", kwargs=dict()
)
AVAILABLE_MODELS["transformers_qwen2dot5_0dot5b_instruct_gpu"] = lambda: dict(
name="transformers:Qwen/Qwen2.5-0.5B-Instruct", kwargs={"device_map": "cuda:0"}
)


# Ensure that asserts from tests/utils.py are rewritten by pytest to show helpful messages
pytest.register_assert_rewrite("tests.utils")


from .utils import get_model

SELECTED_MODEL_ENV_VARIABLE = "GUIDANCE_SELECTED_MODEL"

def pytest_addoption(parser):
SELECTED_MODEL_ENV_VARIABLE = "GUIDANCE_SELECTED_MODEL"
default_model = os.getenv(SELECTED_MODEL_ENV_VARIABLE, "transformers_gpt2_cpu")
parser.addoption(
"--selected_model",
action="store",
default=default_model,
type=str,
choices=AVAILABLE_MODELS.keys(),
help=f"LLM to load when needed. Set default via environment variable {SELECTED_MODEL_ENV_VARIABLE}",
)

@@ -143,13 +21,7 @@ def selected_model_name(pytestconfig) -> str:


@pytest.fixture(scope="session")
def selected_model_info(selected_model_name: str):
model_info = AVAILABLE_MODELS[selected_model_name]()
return model_info


@pytest.fixture(scope="module")
def selected_model(selected_model_info: str) -> models.Model:
def selected_model(selected_model_name: str) -> models.Model:
"""Get a concrete model for tests

This fixture is for tests which are supposed
@@ -161,10 +33,145 @@ def selected_model(selected_model_info: str) -> models.Model:
When running the tests, the model used is
controlled by the '--selected_model' command
line argument to pytest.

The naming convention for the keys is "<loader>_<model>_<host>" where:
- 'loader' is 'transformers' or 'llamacpp'
- 'model' contains relevant information about the model itself
- 'host' is 'cpu' or 'gpu' as appropriate
"""
model = get_model(selected_model_info["name"], **(selected_model_info["kwargs"]))
assert model is not None
return model

# GEMMA 2
if selected_model_name == "llamacpp_gemma2_9b_cpu":
# Note that this model requires an appropriate HF_TOKEN environment variable
from huggingface_hub import hf_hub_download

return models.LlamaCpp(
hf_hub_download(
repo_id="bartowski/gemma-2-9b-it-GGUF", filename="gemma-2-9b-it-IQ2_XS.gguf"
),
verbose=True,
n_ctx=4096,
)
if selected_model_name == "transformers_gemma2_9b_cpu":
# Note that this model requires an appropriate HF_TOKEN environment variable
from transformers import BitsAndBytesConfig

return models.Transformers(
"google/gemma-2-9b-it",
quantization_config=BitsAndBytesConfig(load_in_8bit=True),
)
if selected_model_name == "transformers_gemma2_9b_gpu":
# Note that this model requires an appropriate HF_TOKEN environment variable
from transformers import BitsAndBytesConfig

return models.Transformers(
"google/gemma-2-9b-it",
device_map="cuda:0",
quantization_config=BitsAndBytesConfig(load_in_4bit=True),
)

# GPT 2
if selected_model_name == "transformers_gpt2_cpu":
return models.Transformers("gpt2")
if selected_model_name == "transformers_gpt2_gpu":
return models.Transformers("gpt2", device_map="cuda:0")

# LLAMA 2
if selected_model_name == "llamacpp_llama2_7b_cpu":
from huggingface_hub import hf_hub_download

return models.LlamaCpp(
hf_hub_download(repo_id="TheBloke/Llama-2-7B-GGUF", filename="llama-2-7b.Q5_K_M.gguf"),
verbose=True,
n_ctx=4096,
)
if selected_model_name == "llamacpp_llama2_7b_gpu":
from huggingface_hub import hf_hub_download

return models.LlamaCpp(
hf_hub_download(repo_id="TheBloke/Llama-2-7B-GGUF", filename="llama-2-7b.Q5_K_M.gguf"),
verbose=True,
n_ctx=4096,
n_gpu_layers=-1,
)

# LLAMA 3
if selected_model_name == "transformers_llama3_8b_cpu":
# Note that this model requires an appropriate HF_TOKEN environment variable
from torch import bfloat16

return models.Transformers(
"meta-llama/Meta-Llama-3-8B-Instruct",
trust_remote_code=True,
torch_dtype=bfloat16,
)
if selected_model_name == "transformers_llama3_8b_gpu":
# Note that this model requires an appropriate HF_TOKEN environment variable
from torch import bfloat16

return models.Transformers(
"meta-llama/Meta-Llama-3-8B-Instruct",
trust_remote_code=True,
torch_dtype=bfloat16,
device_map="cuda:0",
)

# MISTRAL
if selected_model_name == "transformers_mistral_7b_cpu":
return models.Transformers("mistralai/Mistral-7B-v0.1")
if selected_model_name == "llamacpp_mistral_7b_cpu":
from huggingface_hub import hf_hub_download

return models.LlamaCpp(
hf_hub_download(
repo_id="TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
filename="mistral-7b-instruct-v0.2.Q8_0.gguf",
),
verbose=True,
n_ctx=2048,
)

# PHI 2
if selected_model_name == "transformers_phi2_cpu":
return models.Transformers("microsoft/phi-2", trust_remote_code=True)
if selected_model_name == "transformers_phi2_gpu":
return models.Transformers("microsoft/phi-2", trust_remote_code=True, device_map="cuda:0")

# PHI 3
if selected_model_name == "transformers_phi3_mini_4k_instruct_cpu":
return models.Transformers("microsoft/Phi-3-mini-4k-instruct", trust_remote_code=True)
if selected_model_name == "llamacpp_phi3_mini_4k_instruct_cpu":
from huggingface_hub import hf_hub_download

return models.LlamaCpp(
hf_hub_download(
repo_id="microsoft/Phi-3-mini-4k-instruct-gguf",
filename="Phi-3-mini-4k-instruct-q4.gguf",
),
verbose=True,
n_ctx=4096,
)
if selected_model_name == "transformers_phi3_small_8k_instruct_gpu":
return models.Transformers(
"microsoft/Phi-3-small-8k-instruct",
trust_remote_code=True,
load_in_8bit=True,
device_map="cuda:0",
)

# QWEN2DOT5
if selected_model_name == "transformers_qwen2dot5_0dot5b_cpu":
return models.Transformers("Qwen/Qwen2.5-0.5B")
if selected_model_name == "transformers_qwen2dot5_0dot5b_gpu":
return models.Transformers("Qwen/Qwen2.5-0.5B", device_map="cuda:0")
if selected_model_name == "transformers_qwen2dot5_0dot5b_instruct_cpu":
return models.Transformers("Qwen/Qwen2.5-0.5B-Instruct")
if selected_model_name == "transformers_qwen2dot5_0dot5b_instruct_gpu":
return models.Transformers("Qwen/Qwen2.5-0.5B-Instruct", device_map="cuda:0")

raise ValueError(
f"No support for selected_model_name {selected_model_name}"
) # pragma: no cover


@pytest.fixture(scope="module")
@@ -179,40 +186,3 @@ def llamacpp_model(selected_model, selected_model_name):
return selected_model
else:
pytest.skip("Requires Llama-Cpp model")


@pytest.fixture(scope="function")
def rate_limiter() -> int:
"""Limit test execution rate

Any test using this fixture will have a
random delay inserted before the test runs.
It can be used as a crude rate limiter for
tests which call external APIs
"""
delay_secs = random.randint(10, 30)
time.sleep(delay_secs)
return delay_secs


@pytest.fixture(scope="session")
def remote_image_url():
return "https://picsum.photos/300/200"


@pytest.fixture(scope="session")
def local_image_path(remote_image_url):
with tempfile.TemporaryDirectory() as temp_dir:
td = pathlib.Path(temp_dir)
filename = f"{str(uuid.uuid4())}.jpg"
with open(td / filename, "wb") as file:
response = requests.get(remote_image_url)
file.write(response.content)
assert (td / filename).exists()
yield td / filename


@pytest.fixture(scope="session")
def local_image_bytes(local_image_path):
with open(local_image_path, "rb") as f:
return f.read()
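To make the new fixture's role concrete, here is a minimal hypothetical test (not part of this PR) that consumes selected_model the same way the llama.cpp tests below do; the test name and prompt are illustrative.

# Hypothetical example test: runs against whichever model was selected
# via --selected_model or GUIDANCE_SELECTED_MODEL.
from guidance import gen, models


def test_selected_model_generates(selected_model: models.Model):
    prompt = "Write one word: "
    lm = selected_model + prompt + gen(max_tokens=5)
    # The model should have appended something after the prompt.
    assert len(str(lm)) > len(prompt)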
tests/model_specific/llama_cpp/test_llama_cpp.py: 12 changes (6 additions, 6 deletions)
@@ -124,47 +124,47 @@ def test_subtoken_forced(llamacpp_model: guidance.models.Model):
assert str(lm) == "How much is 2 + 2? ("


def test_llama_cpp_almost_one_batch(llamacpp_model, selected_model_info):
def test_llama_cpp_almost_one_batch(llamacpp_model):
lm = llamacpp_model
batch_size = lm.engine.model_obj.n_batch
long_str = lm.engine.tokenizer.bos_token.decode("utf-8") * (batch_size - 1)
lm += long_str + gen(max_tokens=10)
assert len(str(lm)) > len(long_str)


def test_llama_cpp_exactly_one_batch(llamacpp_model, selected_model_info):
def test_llama_cpp_exactly_one_batch(llamacpp_model):
lm = llamacpp_model
batch_size = lm.engine.model_obj.n_batch
long_str = lm.engine.tokenizer.bos_token.decode("utf-8") * batch_size
lm += long_str + gen(max_tokens=10)
assert len(str(lm)) > len(long_str)


def test_llama_cpp_more_than_one_batch(llamacpp_model, selected_model_info):
def test_llama_cpp_more_than_one_batch(llamacpp_model):
lm = llamacpp_model
batch_size = lm.engine.model_obj.n_batch
long_str = lm.engine.tokenizer.bos_token.decode("utf-8") * (batch_size + 1)
lm += long_str + gen(max_tokens=10)
assert len(str(lm)) > len(long_str)


def test_llama_cpp_almost_two_batches(llamacpp_model, selected_model_info):
def test_llama_cpp_almost_two_batches(llamacpp_model):
lm = llamacpp_model
batch_size = lm.engine.model_obj.n_batch
long_str = lm.engine.tokenizer.bos_token.decode("utf-8") * ((2 * batch_size) - 1)
lm += long_str + gen(max_tokens=10)
assert len(str(lm)) > len(long_str)


def test_llama_cpp_two_batches(llamacpp_model, selected_model_info):
def test_llama_cpp_two_batches(llamacpp_model):
lm = llamacpp_model
batch_size = lm.engine.model_obj.n_batch
long_str = lm.engine.tokenizer.bos_token.decode("utf-8") * (2 * batch_size)
lm += long_str + gen(max_tokens=10)
assert len(str(lm)) > len(long_str)


def test_llama_cpp_more_than_two_batches(llamacpp_model, selected_model_info):
def test_llama_cpp_more_than_two_batches(llamacpp_model):
lm = llamacpp_model
batch_size = lm.engine.model_obj.n_batch
long_str = lm.engine.tokenizer.bos_token.decode("utf-8") * ((2 * batch_size) + 1)
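Finally, a sketch of how a particular model can be selected when running the suite, assuming only the option and environment variable defined in tests/conftest.py above; the driver-script form and the "tests" path argument are assumptions, not part of the PR.

# Hypothetical driver script: pick a model either through the environment
# variable consulted for the default, or explicitly via the pytest option.
import os
import pytest

os.environ["GUIDANCE_SELECTED_MODEL"] = "transformers_gpt2_cpu"  # default for --selected_model
raise SystemExit(pytest.main(["--selected_model", "transformers_gpt2_cpu", "tests"]))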