Skip to content

Commit 3c10c03

Browse files
authored
Litellm 1.66.0 (#94)
* bump litellm * client v2.5.4 * vector_stores no longer in beta * comment out response_format=json_object from tests * comment out response_format=json_object from tests * embedding to_dict * client v2.5.5 * Update inference_utils.py
1 parent 3f43c19 commit 3c10c03

12 files changed

+66
-40
lines changed

client/astra_assistants/patch.py

+12-3
Original file line numberDiff line numberDiff line change
@@ -403,7 +403,10 @@ def patched_create(self, *args, **kwargs):
403403
):
404404
# TODO figure out how to get the model from the tool resources
405405
vector_store_id = assistant.tool_resources.file_search.vector_store_ids[0]
406-
file_list_paginator = client.beta.vector_stores.files.list(vector_store_id=vector_store_id)
406+
try:
407+
file_list_paginator = client.beta.vector_stores.files.list(vector_store_id=vector_store_id)
408+
except Exception as e:
409+
file_list_paginator = client.vector_stores.files.list(vector_store_id=vector_store_id)
407410
vs_file = async_helper.run_async(fetch_first_page(file_list_paginator))
408411
if vs_file is not None:
409412
# use the first file
@@ -453,7 +456,10 @@ def patched_create(self, *args, **kwargs):
453456
):
454457
# TODO figure out how to get the model from the tool resources
455458
vector_store_id = assistant.tool_resources.file_search.vector_store_ids[0]
456-
vs_files = client.beta.vector_stores.files.list(vector_store_id=vector_store_id).data
459+
try:
460+
vs_files = client.beta.vector_stores.files.list(vector_store_id=vector_store_id).data
461+
except Exception as e:
462+
vs_files = client.vector_stores.files.list(vector_store_id=vector_store_id).data
457463
if len(vs_files) > 0:
458464
# use the first file
459465
vs_file: VectorStoreFile= vs_files[0]
@@ -503,7 +509,10 @@ async def patched_create(self, *args, **kwargs):
503509
):
504510
# TODO figure out how to get the model from the tool resources
505511
vector_store_id = assistant.tool_resources.file_search.vector_store_ids[0]
506-
vs_files = await client.beta.vector_stores.files.list(vector_store_id=vector_store_id).data
512+
try:
513+
vs_files = await client.beta.vector_stores.files.list(vector_store_id=vector_store_id).data
514+
except Exception as e:
515+
vs_files = await client.vector_stores.files.list(vector_store_id=vector_store_id).data
507516
if len(vs_files) > 0:
508517
# use the first file
509518
vs_file: VectorStoreFile= vs_files[0]

client/pyproject.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "astra-assistants"
3-
version = "2.5.4"
3+
version = "2.5.5"
44
description = "Astra Assistants API - drop in replacement for OpenAI Assistants, powered by AstraDB"
55
authors = ["phact <estevezsebastian@gmail.com>"]
66
readme = "README.md"

client/tests/astra-assistants/test_chat_completion.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ def print_chat_completion(model, client):
4242
{"role": "system", "content": "You are an amazing json generator."},
4343
{"role": "user", "content": prompt}
4444
],
45-
response_format={"type": "json_object"},
45+
#response_format={"type": "json_object"},
4646
)
4747

4848
logger.info(f'prompt> {prompt}')

client/tests/astra-assistants/test_run_retreival_v2.py

+7-7
Original file line numberDiff line numberDiff line change
@@ -32,17 +32,17 @@ def run_with_assistant(assistant, client, file_path, embedding_model):
3232
pass
3333

3434

35-
vector_store = client.beta.vector_stores.create(
35+
vector_store = client.vector_stores.create(
3636
name="papers",
3737
file_ids=[file.id]
3838
)
3939

40-
vs_list = client.beta.vector_stores.list()
40+
vs_list = client.vector_stores.list()
4141

4242
assert len(vs_list.data) > 0, "vector store list is empty"
4343

4444
# TODO support vector store file creation
45-
#file = client.beta.vector_stores.files.create_and_poll(
45+
#file = client.vector_stores.files.create_and_poll(
4646
# vector_store_id=vector_store.id,
4747
# file_id=file2.id
4848
#)
@@ -54,7 +54,7 @@ def run_with_assistant(assistant, client, file_path, embedding_model):
5454

5555
# Use the upload and poll SDK helper to upload the files, add them to the vector store,
5656
# and poll the status of the file batch for completion.
57-
#file_batch = client.beta.vector_stores.file_batches.upload_and_poll(
57+
#file_batch = client.vector_stores.file_batches.upload_and_poll(
5858
# vector_store_id=vector_store.id, files=file_streams
5959
#)
6060

@@ -116,10 +116,10 @@ def run_with_assistant(assistant, client, file_path, embedding_model):
116116
response = client.beta.threads.messages.list(thread_id=thread.id)
117117
logger.info(response.data[0].content[0].text.value)
118118

119-
vs_files = client.beta.vector_stores.files.list(vector_store_id=vector_store.id)
119+
vs_files = client.vector_stores.files.list(vector_store_id=vector_store.id)
120120
for vsf in vs_files.data:
121-
client.beta.vector_stores.files.delete(file_id=vsf.id, vector_store_id=vector_store.id)
122-
client.beta.vector_stores.delete(vector_store.id)
121+
client.vector_stores.files.delete(file_id=vsf.id, vector_store_id=vector_store.id)
122+
client.vector_stores.delete(vector_store.id)
123123

124124

125125

client/tests/astra-assistants/test_streaming_run_retrieval_async_v2.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -23,13 +23,13 @@ async def run_with_assistant(assistant, client):
2323
purpose="assistants",
2424
)
2525

26-
vector_store = await client.beta.vector_stores.create(
26+
vector_store = await client.vector_stores.create(
2727
name="papers",
2828
file_ids=[file.id]
2929
)
3030

3131
# TODO support vector store file creation
32-
#file = await client.beta.vector_stores.files.create_and_poll(
32+
#file = await client.vector_stores.files.create_and_poll(
3333
# vector_store_id=vector_store.id,
3434
# file_id=file2.id
3535
#)
@@ -41,7 +41,7 @@ async def run_with_assistant(assistant, client):
4141

4242
# Use the upload and poll SDK helper to upload the files, add them to the vector store,
4343
# and poll the status of the file batch for completion.
44-
#file_batch = await client.beta.vector_stores.file_batches.upload_and_poll(
44+
#file_batch = await client.vector_stores.file_batches.upload_and_poll(
4545
# vector_store_id=vector_store.id, files=file_streams
4646
#)
4747

client/tests/astra-assistants/test_streaming_run_retrieval_v2.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,13 @@ def run_with_assistant(assistant, client):
1919
purpose="assistants",
2020
)
2121

22-
vector_store = client.beta.vector_stores.create(
22+
vector_store = client.vector_stores.create(
2323
name="papers",
2424
file_ids=[file.id]
2525
)
2626

2727
# TODO support vector store file creation
28-
#file = client.beta.vector_stores.files.create_and_poll(
28+
#file = client.vector_stores.files.create_and_poll(
2929
# vector_store_id=vector_store.id,
3030
# file_id=file2.id
3131
#)
@@ -37,7 +37,7 @@ def run_with_assistant(assistant, client):
3737

3838
# Use the upload and poll SDK helper to upload the files, add them to the vector store,
3939
# and poll the status of the file batch for completion.
40-
#file_batch = client.beta.vector_stores.file_batches.upload_and_poll(
40+
#file_batch = client.vector_stores.file_batches.upload_and_poll(
4141
# vector_store_id=vector_store.id, files=file_streams
4242
#)
4343

client/tests/openai-sdk/test_assistants_v2.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ def test_assistants_crud(openai_client):
4747
assert asst.response_format == response_format
4848
assert len(asst.tool_resources.file_search.vector_store_ids[0]) > 0
4949

50-
vs = openai_client.beta.vector_stores.retrieve(asst.tool_resources.file_search.vector_store_ids[0])
50+
vs = openai_client.vector_stores.retrieve(asst.tool_resources.file_search.vector_store_ids[0])
5151
assert vs.id == asst.tool_resources.file_search.vector_store_ids[0]
5252

5353
assert asst.name == "Math Tutor"

impl/routes/stateless.py

+2
Original file line numberDiff line numberDiff line change
@@ -323,6 +323,8 @@ async def create_embedding(
323323

324324
data = []
325325
for datum in embedding_response.data:
326+
if hasattr(datum, "to_dict"):
327+
datum = datum.to_dict()
326328
embedding = Embedding(**datum)
327329
data.append(embedding)
328330

impl/services/inference_utils.py

+21-7
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,9 @@ def get_embeddings(
8383
return [result["embedding"] for result in response.data]
8484

8585

86+
import asyncio
87+
from litellm.exceptions import RateLimitError
88+
8689
async def get_async_chat_completion_response(
8790
messages: List[Dict[str, Any]],
8891
model: Optional[str] = None,
@@ -116,13 +119,24 @@ async def get_async_chat_completion_response(
116119
else:
117120
litellm_kwargs[key] = type_hints[key](value)
118121

119-
completion = await acompletion(
120-
model=model,
121-
messages=messages,
122-
deployment_id=deployment_id,
123-
**litellm_kwargs
124-
)
125-
return completion
122+
max_retries = 5
123+
for attempt in range(max_retries):
124+
try:
125+
# Your existing logic to get the response
126+
completion = await acompletion(
127+
model=model,
128+
messages=messages,
129+
deployment_id=deployment_id,
130+
**litellm_kwargs
131+
)
132+
return completion
133+
except RateLimitError as e:
134+
if attempt < max_retries - 1:
135+
backoff_time = 2 ** attempt # Exponential backoff
136+
await asyncio.sleep(backoff_time)
137+
else:
138+
raise HTTPException(status_code=429, detail=f"Rate limit exceeded: {e}")
139+
126140
except Exception as e:
127141
if "LLM Provider NOT provided" in e.args[0]:
128142
logger.error(f"Error: error {model} is not currently supported")

poetry.lock

+13-12
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ tiktoken = "^0.7.0"
5050
docx2txt = "^0.8"
5151
pypdf2 = "^3.0.1"
5252
python-pptx = "^0.6.23"
53-
litellm = "1.42.5"
53+
litellm = "1.66.0"
5454
boto3 = "^1.29.6"
5555
prometheus-fastapi-instrumentator = "^6.1.0"
5656
google-cloud-aiplatform = "^1.38.0"

tests/http/test_chat_api.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ def test_create_chat_completion(client: TestClient):
4141
"n":2,
4242
"top_p":1,
4343
"frequency_penalty":-1.6796687238155954,
44-
"response_format":{"type":"json_object"},
44+
#"response_format":{"type":"json_object"},
4545
"stream":False,
4646
"temperature":1,
4747
"messages":[

0 commit comments

Comments (0)