
Commit 8c4c1d0

Merge pull request #90 from ggerganov/master

b2254

2 parents 525213d + 9e359a4

17 files changed: +1379 -162 lines

examples/quantize/quantize.cpp

+2
@@ -27,6 +27,8 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "Q2_K",   LLAMA_FTYPE_MOSTLY_Q2_K,   " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
     { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", },
     { "IQ3_XXS",LLAMA_FTYPE_MOSTLY_IQ3_XXS," 3.06 bpw quantization", },
+    { "IQ3_S",  LLAMA_FTYPE_MOSTLY_IQ3_S,  " 3.44 bpw quantization", },
+    { "IQ3_M",  LLAMA_FTYPE_MOSTLY_IQ3_M,  " 3.66 bpw quantization mix", },
     { "Q3_K",   LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
     { "Q3_K_XS",LLAMA_FTYPE_MOSTLY_Q3_K_XS,"3-bit extra small quantization" , },
     { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", },

examples/server/server.cpp

+1 -1

@@ -1836,7 +1836,7 @@ struct llama_server_context
                     send_embedding(slot);
                     slot.release();
                     slot.i_batch = -1;
-                    return true;
+                    continue;
                 }

                 completion_token_output result;
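
This one-line change swaps an early return for continue: after an embedding slot's result is sent and the slot released, the loop can keep handling any other slots still in the same batch instead of leaving the function. A rough Python analogue of that control-flow difference (illustrative only, not the server's actual code):

def process_batch(slots):
    # Illustrative analogue: one pass over the slots sharing a decoded batch.
    results = []
    for slot in slots:
        if slot.get("embedding_only"):
            results.append(("embedding", slot["id"]))
            # Returning here would drop the remaining slots in this batch;
            # `continue` finishes this slot and moves on to the next one.
            continue
        results.append(("completion", slot["id"]))
    return results

print(process_batch([{"id": 0, "embedding_only": True}, {"id": 1}]))
# [('embedding', 0), ('completion', 1)]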
examples/server/tests/features/issues.feature

+1 -33

@@ -1,36 +1,4 @@
 # List of ongoing issues
 @bug
 Feature: Issues
-  # Issue #5655
-  Scenario: Multi users embeddings
-    Given a server listening on localhost:8080
-    And a model file stories260K.gguf
-    And a model alias tinyllama-2
-    And 42 as server seed
-    And 64 KV cache size
-    And 2 slots
-    And continuous batching
-    And embeddings extraction
-    Then the server is starting
-    Then the server is healthy
-
-    Given a prompt:
-      """
-      Write a very long story about AI.
-      """
-    And a prompt:
-      """
-      Write another very long music lyrics.
-      """
-    And a prompt:
-      """
-      Write a very long poem.
-      """
-    And a prompt:
-      """
-      Write a very long joke.
-      """
-    Given concurrent embedding requests
-    Then the server is busy
-    Then the server is idle
-    Then all embeddings are generated
+  # No confirmed issue at the moment

examples/server/tests/features/parallel.feature

+46
@@ -8,6 +8,7 @@ Feature: Parallel
     And 42 as server seed
     And 64 KV cache size
     And 2 slots
+    And embeddings extraction
     And continuous batching
     Then the server is starting
     Then the server is healthy
@@ -75,3 +76,48 @@ Feature: Parallel
     Then the server is busy
     Then the server is idle
     Then all prompts are predicted
+
+  Scenario: Multi users embeddings
+    Given a prompt:
+      """
+      Write a very long story about AI.
+      """
+    And a prompt:
+      """
+      Write another very long music lyrics.
+      """
+    And a prompt:
+      """
+      Write a very long poem.
+      """
+    And a prompt:
+      """
+      Write a very long joke.
+      """
+    Given concurrent embedding requests
+    Then the server is busy
+    Then the server is idle
+    Then all embeddings are generated
+
+  Scenario: Multi users OAI compatibility embeddings
+    Given a prompt:
+      """
+      In which country Paris is located ?
+      """
+    And a prompt:
+      """
+      Is Madrid the capital of Spain ?
+      """
+    And a prompt:
+      """
+      What is the biggest US city ?
+      """
+    And a prompt:
+      """
+      What is the capital of Bulgaria ?
+      """
+    And a model tinyllama-2
+    Given concurrent OAI embedding requests
+    Then the server is busy
+    Then the server is idle
+    Then all embeddings are generated
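
The two new scenarios fire several embedding requests at the server at once, through the native /embedding endpoint and the OpenAI-compatible /v1/embeddings endpoint respectively. A minimal standalone sketch of what the OAI variant boils down to, assuming a server on localhost:8080 with the tinyllama-2 alias as in the test setup:

import asyncio
import aiohttp

BASE_URL = "http://localhost:8080"  # assumed server address, as in the test environment

async def embed(session, prompt):
    # One OAI-compatible embeddings request; the response shape mirrors what steps.py asserts.
    async with session.post(f"{BASE_URL}/v1/embeddings",
                            json={"input": prompt, "model": "tinyllama-2"}) as resp:
        assert resp.status == 200
        body = await resp.json()
        return body["data"][0]["embedding"]

async def main():
    prompts = ["Write a very long story about AI.",
               "Write a very long poem."]
    async with aiohttp.ClientSession() as session:
        embeddings = await asyncio.gather(*(embed(session, p) for p in prompts))
    print([len(e) for e in embeddings])  # one vector per prompt

asyncio.run(main())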

examples/server/tests/features/server.feature

+13
@@ -60,6 +60,19 @@ Feature: llama.cpp server
       """
     Then embeddings are generated

+  Scenario: OAI Embeddings compatibility with multiple inputs
+    Given a model tinyllama-2
+    Given a prompt:
+      """
+      In which country Paris is located ?
+      """
+    And a prompt:
+      """
+      Is Madrid the capital of Spain ?
+      """
+    When an OAI compatible embeddings computation request for multiple inputs
+    Then embeddings are generated
+

   Scenario: Tokenize / Detokenize
     When tokenizing:
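
The multiple-inputs scenario relies on the OpenAI-compatible endpoint accepting a list for the "input" field and returning one embedding object per item. The test harness drives this through the legacy (pre-1.0) openai Python client; a minimal sketch of the same call, with the server address and model alias assumed from the test setup:

import openai

openai.api_key = "nope"                       # the test server does not require a real key
openai.api_base = "http://localhost:8080/v1"  # assumed llama.cpp server address

result = openai.Embedding.create(
    model="tinyllama-2",
    input=["In which country Paris is located ?",
           "Is Madrid the capital of Spain ?"],
)
# One embedding object per input, in order.
vectors = [item.embedding for item in result.data]
print(len(vectors))  # 2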

examples/server/tests/features/steps/steps.py

+107 -44

@@ -1,4 +1,5 @@
 import asyncio
+import collections
 import json
 import os
 import re
@@ -261,35 +262,35 @@ def step_a_prompt_prompt(context, prompt):
 @step(u'concurrent completion requests')
 @async_run_until_complete()
 async def step_concurrent_completion_requests(context):
-    await concurrent_completion_requests(context,
-                                          request_completion,
-                                          # prompt is inserted automatically
-                                          context.base_url,
-                                          debug=context.debug,
-                                          n_predict=context.n_predict if hasattr(context, 'n_predict') else None,
-                                          server_seed=context.server_seed if hasattr(context, 'server_seed') else None,
-                                          user_api_key=context.user_api_key if hasattr(context,
-                                                                                        'user_api_key') else None)
+    await concurrent_requests(context,
+                              request_completion,
+                              # prompt is inserted automatically
+                              context.base_url,
+                              debug=context.debug,
+                              n_predict=context.n_predict if hasattr(context, 'n_predict') else None,
+                              server_seed=context.server_seed if hasattr(context, 'server_seed') else None,
+                              user_api_key=context.user_api_key if hasattr(context,
+                                                                           'user_api_key') else None)


 @step(u'concurrent OAI completions requests')
 @async_run_until_complete
 async def step_oai_chat_completions(context):
-    await concurrent_completion_requests(context, oai_chat_completions,
-                                          # user_prompt is inserted automatically
-                                          context.system_prompt,
-                                          context.base_url,
-                                          True,  # async_client
-                                          model=context.model
-                                          if hasattr(context, 'model') else None,
-                                          n_predict=context.n_predict
-                                          if hasattr(context, 'n_predict') else None,
-                                          enable_streaming=context.enable_streaming
-                                          if hasattr(context, 'enable_streaming') else None,
-                                          server_seed=context.server_seed
-                                          if hasattr(context, 'server_seed') else None,
-                                          user_api_key=context.user_api_key
-                                          if hasattr(context, 'user_api_key') else None)
+    await concurrent_requests(context, oai_chat_completions,
+                              # user_prompt is inserted automatically
+                              context.system_prompt,
+                              context.base_url,
+                              True,  # async_client
+                              model=context.model
+                              if hasattr(context, 'model') else None,
+                              n_predict=context.n_predict
+                              if hasattr(context, 'n_predict') else None,
+                              enable_streaming=context.enable_streaming
+                              if hasattr(context, 'enable_streaming') else None,
+                              server_seed=context.server_seed
+                              if hasattr(context, 'server_seed') else None,
+                              user_api_key=context.user_api_key
+                              if hasattr(context, 'user_api_key') else None)


 @step(u'all prompts are predicted')
@@ -316,36 +317,58 @@ async def all_prompts_are_predicted(context, expected_predicted_n=None):
 @step(u'embeddings are computed for')
 @async_run_until_complete
 async def step_compute_embedding(context):
-    content = context.text
-    base_url = context.base_url
-    context.embeddings = await request_embedding(content, base_url)
+    context.embeddings = await request_embedding(context.text, base_url=context.base_url)


 @step(u'embeddings are generated')
 def step_assert_embeddings(context):
-    assert_embeddings(context.embeddings)
+    if len(context.prompts) == 0:
+        assert_embeddings(context.embeddings)
+    else:
+        assert len(context.embeddings) == len(context.prompts), (f"unexpected response:\n"
+                                                                 f"context.prompts={context.prompts}\n"
+                                                                 f"context.embeddings={context.embeddings}")
+        for embedding in context.embeddings:
+            context.prompts.pop()
+            assert_embeddings(embedding)


 @step(u'an OAI compatible embeddings computation request for')
-def step_oai_compute_embedding(context):
-    openai.api_key = 'nope'  # openai client always expects an api_keu
-    if context.user_api_key is not None:
-        openai.api_key = context.user_api_key
-    openai.api_base = f'{context.base_url}/v1'
-    embeddings = openai.Embedding.create(
-        model=context.model,
-        input=context.text,
-    )
-    context.embeddings = embeddings
+@async_run_until_complete
+async def step_oai_compute_embeddings(context):
+    context.embeddings = await request_oai_embeddings(context.text,
+                                                      base_url=context.base_url,
+                                                      user_api_key=context.user_api_key,
+                                                      model=context.model)
+
+
+@step(u'an OAI compatible embeddings computation request for multiple inputs')
+@async_run_until_complete
+async def step_oai_compute_embeddings_multiple_inputs(context):
+    context.embeddings = await request_oai_embeddings(context.prompts,
+                                                      base_url=context.base_url,
+                                                      user_api_key=context.user_api_key,
+                                                      model=context.model)


 @step(u'concurrent embedding requests')
 @async_run_until_complete()
 async def step_concurrent_embedding_requests(context):
-    await concurrent_completion_requests(context,
-                                          request_embedding,
-                                          # prompt is inserted automatically
-                                          context.base_url)
+    await concurrent_requests(context,
+                              request_embedding,
+                              # prompt is inserted automatically
+                              base_url=context.base_url)
+
+
+@step(u'concurrent OAI embedding requests')
+@async_run_until_complete()
+async def step_concurrent_oai_embedding_requests(context):
+    await concurrent_requests(context,
+                              request_oai_embeddings,
+                              # prompt is inserted automatically
+                              base_url=context.base_url,
+                              async_client=True,
+                              model=context.model)


 @step(u'all embeddings are generated')
@@ -401,7 +424,7 @@ def step_check_options_header_value(context, cors_header, cors_header_value):
     assert context.options_response.headers[cors_header] == cors_header_value


-async def concurrent_completion_requests(context, f_completion, *args, **kwargs):
+async def concurrent_requests(context, f_completion, *args, **kwargs):
     n_prompts = len(context.prompts)
     if context.debug:
         print(f"starting {n_prompts} concurrent completion requests...")
@@ -565,7 +588,7 @@ async def oai_chat_completions(user_prompt,
     return completion_response


-async def request_embedding(content, base_url):
+async def request_embedding(content, base_url=None):
     async with aiohttp.ClientSession() as session:
         async with session.post(f'{base_url}/embedding',
                                 json={
@@ -576,6 +599,46 @@ async def request_embedding(content, base_url):
             return response_json['embedding']


+async def request_oai_embeddings(input,
+                                 base_url=None, user_api_key=None,
+                                 model=None, async_client=False):
+    # openai client always expects an api_key
+    user_api_key = user_api_key if user_api_key is not None else 'nope'
+    if async_client:
+        origin = 'llama.cpp'
+        if user_api_key is not None:
+            headers = {'Authorization': f'Bearer {user_api_key}', 'Origin': origin}
+        async with aiohttp.ClientSession() as session:
+            async with session.post(f'{base_url}/v1/embeddings',
+                                    json={
+                                        "input": input,
+                                        "model": model,
+                                    },
+                                    headers=headers) as response:
+                assert response.status == 200, f"received status code not expected: {response.status}"
+                assert response.headers['Access-Control-Allow-Origin'] == origin
+                assert response.headers['Content-Type'] == "application/json; charset=utf-8"
+                response_json = await response.json()
+                assert response_json['model'] == model, f"invalid model received: {response_json['model']}"
+                assert response_json['object'] == 'list'
+                return response_json['data']
+    else:
+        openai.api_key = user_api_key
+        openai.api_base = f'{base_url}/v1'
+        oai_embeddings = openai.Embedding.create(
+            model=model,
+            input=input,
+        )
+
+        if isinstance(input, collections.abc.Sequence):
+            embeddings = []
+            for an_oai_embeddings in oai_embeddings.data:
+                embeddings.append(an_oai_embeddings.embedding)
+        else:
+            embeddings = oai_embeddings.data.embedding
+        return embeddings
+
+
 def assert_n_tokens_predicted(completion_response, expected_predicted_n=None, re_content=None):
     content = completion_response['content']
     n_predicted = completion_response['timings']['predicted_n']
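
The renamed concurrent_requests helper now serves completions and embeddings alike, and the new request_oai_embeddings covers both transports: a raw aiohttp POST to /v1/embeddings (async_client=True, with Origin/CORS and response-shape assertions) or the legacy openai client otherwise. A small sketch of calling the new helper directly, outside behave; the import path, server address, and model alias are assumptions:

import asyncio

# Assumes steps.py is importable (e.g. run from examples/server/tests/features/steps).
from steps import request_oai_embeddings

async def demo():
    base_url = "http://localhost:8080"  # assumed local llama.cpp server

    # List input over the aiohttp path: returns the raw OAI "data" list,
    # one entry per prompt, each carrying an "embedding".
    data = await request_oai_embeddings(["Write a very long poem.",
                                         "Write a very long joke."],
                                        base_url=base_url,
                                        model="tinyllama-2",
                                        async_client=True)

    # Single prompt over the legacy openai-client path (async_client=False).
    single = await request_oai_embeddings("What is the capital of Bulgaria ?",
                                          base_url=base_url,
                                          model="tinyllama-2")

    print(len(data), len(single))

asyncio.run(demo())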

0 commit comments
