Spaces:
Paused
Paused
| # #### What this tests #### | |
| # # This adds perf testing to the router, to ensure it's never > 50ms slower than the azure-openai sdk. | |
| # import sys, os, time, inspect, asyncio, traceback | |
| # from datetime import datetime | |
| # import pytest | |
| # sys.path.insert(0, os.path.abspath("../..")) | |
| # import openai, litellm, uuid | |
| # from openai import AsyncAzureOpenAI | |
| # client = AsyncAzureOpenAI( | |
| # api_key=os.getenv("AZURE_API_KEY"), | |
| # azure_endpoint=os.getenv("AZURE_API_BASE"), # type: ignore | |
| # api_version=os.getenv("AZURE_API_VERSION"), | |
| # ) | |
| # model_list = [ | |
| # { | |
| # "model_name": "azure-test", | |
| # "litellm_params": { | |
| # "model": "azure/chatgpt-v-3", | |
| # "api_key": os.getenv("AZURE_API_KEY"), | |
| # "api_base": os.getenv("AZURE_API_BASE"), | |
| # "api_version": os.getenv("AZURE_API_VERSION"), | |
| # }, | |
| # } | |
| # ] | |
| # router = litellm.Router(model_list=model_list) # type: ignore | |
| # async def _openai_completion(): | |
| # try: | |
| # start_time = time.time() | |
| # response = await client.chat.completions.create( | |
| # model="chatgpt-v-3", | |
| # messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}], | |
| # stream=True, | |
| # ) | |
| # time_to_first_token = None | |
| # first_token_ts = None | |
| # init_chunk = None | |
| # async for chunk in response: | |
| # if ( | |
| # time_to_first_token is None | |
| # and len(chunk.choices) > 0 | |
| # and chunk.choices[0].delta.content is not None | |
| # ): | |
| # first_token_ts = time.time() | |
| # time_to_first_token = first_token_ts - start_time | |
| # init_chunk = chunk | |
| # end_time = time.time() | |
| # print( | |
| # "OpenAI Call: ", | |
| # init_chunk, | |
| # start_time, | |
| # first_token_ts, | |
| # time_to_first_token, | |
| # end_time, | |
| # ) | |
| # return time_to_first_token | |
| # except Exception as e: | |
| # print(e) | |
| # return None | |
| # async def _router_completion(): | |
| # try: | |
| # start_time = time.time() | |
| # response = await router.acompletion( | |
| # model="azure-test", | |
| # messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}], | |
| # stream=True, | |
| # ) | |
| # time_to_first_token = None | |
| # first_token_ts = None | |
| # init_chunk = None | |
| # async for chunk in response: | |
| # if ( | |
| # time_to_first_token is None | |
| # and len(chunk.choices) > 0 | |
| # and chunk.choices[0].delta.content is not None | |
| # ): | |
| # first_token_ts = time.time() | |
| # time_to_first_token = first_token_ts - start_time | |
| # init_chunk = chunk | |
| # end_time = time.time() | |
| # print( | |
| # "Router Call: ", | |
| # init_chunk, | |
| # start_time, | |
| # first_token_ts, | |
| # time_to_first_token, | |
| #             end_time, | |
| # ) | |
| # return time_to_first_token | |
| # except Exception as e: | |
| # print(e) | |
| # return None | |
| # async def test_azure_completion_streaming(): | |
| # """ | |
| # Test azure streaming call - measure on time to first (non-null) token. | |
| # """ | |
| # n = 3 # Number of concurrent tasks | |
| # ## OPENAI AVG. TIME | |
| # tasks = [_openai_completion() for _ in range(n)] | |
| # chat_completions = await asyncio.gather(*tasks) | |
| # successful_completions = [c for c in chat_completions if c is not None] | |
| # total_time = 0 | |
| # for item in successful_completions: | |
| # total_time += item | |
| #     avg_openai_time = total_time / len(successful_completions) | |
| # ## ROUTER AVG. TIME | |
| # tasks = [_router_completion() for _ in range(n)] | |
| # chat_completions = await asyncio.gather(*tasks) | |
| # successful_completions = [c for c in chat_completions if c is not None] | |
| # total_time = 0 | |
| # for item in successful_completions: | |
| # total_time += item | |
| #     avg_router_time = total_time / len(successful_completions) | |
| # ## COMPARE | |
| # print(f"avg_router_time: {avg_router_time}; avg_openai_time: {avg_openai_time}") | |
| # assert avg_router_time < avg_openai_time + 0.5 | |
| # # asyncio.run(test_azure_completion_streaming()) | |