Spaces:

ka1kuk
/

litellm

Runtime error

App Files Files Community

ka1kuk commited on Jan 16, 2024

Commit

6cb06ab

verified ·

1 Parent(s): 9d727ab

Update main.py

Browse files

Files changed (1) hide show

main.py +63 -344

main.py CHANGED Viewed

@@ -1,8 +1,37 @@
-import click
-import subprocess, traceback, json
-import os, sys
 import random
-import importlib
 def run_ollama_serve():
     try:
@@ -15,353 +44,43 @@ def run_ollama_serve():
             f"""
             LiteLLM Warning: proxy started with `ollama` model\n`ollama serve` failed with Exception{e}. \nEnsure you run `ollama serve`
         """
-        )  # noqa
 def is_port_in_use(port):
     import socket
     with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
         return s.connect_ex(("localhost", port)) == 0
-def run_server(
-    host = "0.0.0.0",
-    port = 8000,
-    api_base = None,
-    api_version = "2023-07-01-preview",
-    model = None,
-    alias = None,
-    add_key = None,
-    headers = None,
-    save = False,
-    debug = False,
-    detailed_debug = False,
-    temperature = 0.0,
-    max_tokens = 1000,
-    request_timeout = 10,
-    drop_params = True,
-    add_function_to_prompt = True,
-    config = None,
-    max_budget = 100,
-    telemetry = False,
-    test = False,
-    local = False,
-    num_workers = 1,
-    test_async = False,
-    num_requests = 1,
-    use_queue = False,
-    health = False,
-    version = False,
-):
-    global feature_telemetry
-    args = locals()
-    if local:
-        from proxy_server import app, save_worker_config, usage_telemetry
-    else:
-        try:
-            from .litellm.proxy.proxy_server import app, save_worker_config, usage_telemetry
-        except ImportError as e:
-            if "litellm[proxy]" in str(e):
-                # user is missing a proxy dependency, ask them to pip install litellm[proxy]
-                raise e
-            else:
-                # this is just a local/relative import error, user git cloned litellm
-                from proxy_server import app, save_worker_config, usage_telemetry
-    feature_telemetry = usage_telemetry
-    if version == True:
-        pkg_version = importlib.metadata.version("litellm")
-        click.echo(f"\nLiteLLM: Current Version = {pkg_version}\n")
-        return
-    if model and "ollama" in model and api_base is None:
         run_ollama_serve()
-    if test_async is True:
-        import requests, concurrent, time
-        api_base = f"http://{host}:{port}"
-        def _make_openai_completion():
-            data = {
-                "model": "gpt-3.5-turbo",
-                "messages": [
-                    {"role": "user", "content": "Write a short poem about the moon"}
-                ],
-            }
-            response = requests.post("http://0.0.0.0:8000/queue/request", json=data)
-            response = response.json()
-            while True:
-                try:
-                    url = response["url"]
-                    polling_url = f"{api_base}{url}"
-                    polling_response = requests.get(polling_url)
-                    polling_response = polling_response.json()
-                    print("\n RESPONSE FROM POLLING JOB", polling_response)
-                    status = polling_response["status"]
-                    if status == "finished":
-                        llm_response = polling_response["result"]
-                        break
-                    print(
-                        f"POLLING JOB{polling_url}\nSTATUS: {status}, \n Response {polling_response}"
-                    )  # noqa
-                    time.sleep(0.5)
-                except Exception as e:
-                    print("got exception in polling", e)
-                    break
-        # Number of concurrent calls (you can adjust this)
-        concurrent_calls = num_requests
-        # List to store the futures of concurrent calls
-        futures = []
-        start_time = time.time()
-        # Make concurrent calls
-        with concurrent.futures.ThreadPoolExecutor(
-            max_workers=concurrent_calls
-        ) as executor:
-            for _ in range(concurrent_calls):
-                futures.append(executor.submit(_make_openai_completion))
-        # Wait for all futures to complete
-        concurrent.futures.wait(futures)
-        # Summarize the results
-        successful_calls = 0
-        failed_calls = 0
-        for future in futures:
-            if future.done():
-                if future.result() is not None:
-                    successful_calls += 1
-                else:
-                    failed_calls += 1
-        end_time = time.time()
-        print(f"Elapsed Time: {end_time-start_time}")
-        print(f"Load test Summary:")
-        print(f"Total Requests: {concurrent_calls}")
-        print(f"Successful Calls: {successful_calls}")
-        print(f"Failed Calls: {failed_calls}")
-        return
-    if health != False:
-        import requests
-        print("\nLiteLLM: Health Testing models in config")
-        response = requests.get(url=f"http://{host}:{port}/health")
-        print(json.dumps(response.json(), indent=4))
-        return
-    if test != False:
-        request_model = model or "gpt-3.5-turbo"
-        click.echo(
-            f"\nLiteLLM: Making a test ChatCompletions request to your proxy. Model={request_model}"
-        )
-        import openai
-        if test == True:  # flag value set
-            api_base = f"http://{host}:{port}"
-        else:
-            api_base = test
-        client = openai.OpenAI(api_key="My API Key", base_url=api_base)
-        response = client.chat.completions.create(
-            model=request_model,
-            messages=[
-                {
-                    "role": "user",
-                    "content": "this is a test request, write a short poem",
-                }
-            ],
-            max_tokens=256,
-        )
-        click.echo(f"\nLiteLLM: response from proxy {response}")
-        print(
-            f"\n LiteLLM: Making a test ChatCompletions + streaming request to proxy. Model={request_model}"
-        )
-        response = client.chat.completions.create(
-            model=request_model,
-            messages=[
-                {
-                    "role": "user",
-                    "content": "this is a test request, write a short poem",
-                }
-            ],
-            stream=True,
-        )
-        for chunk in response:
-            click.echo(f"LiteLLM: streaming response from proxy {chunk}")
-        print("\n making completion request to proxy")
-        response = client.completions.create(
-            model=request_model, prompt="this is a test request, write a short poem"
-        )
-        print(response)
-        return
-    else:
-        if headers:
-            headers = json.loads(headers)
-        save_worker_config(
-            model=model,
-            alias=alias,
-            api_base=api_base,
-            api_version=api_version,
-            debug=debug,
-            detailed_debug=detailed_debug,
-            temperature=temperature,
-            max_tokens=max_tokens,
-            request_timeout=request_timeout,
-            max_budget=max_budget,
-            telemetry=telemetry,
-            drop_params=drop_params,
-            add_function_to_prompt=add_function_to_prompt,
-            headers=headers,
-            save=save,
-            config=config,
-            use_queue=use_queue,
-        )
-        try:
-            import uvicorn
-            if os.name == "nt":
-                pass
-            else:
-                import gunicorn.app.base
-        except:
-            raise ImportError(
-                "Uvicorn, gunicorn needs to be imported. Run - `pip 'litellm[proxy]'`"
-            )
-        if config is not None:
-            """
-            Allow user to pass in db url via config
-            read from there and save it to os.env['DATABASE_URL']
-            """
-            try:
-                import yaml
-            except:
-                raise ImportError(
-                    "yaml needs to be imported. Run - `pip install 'litellm[proxy]'`"
-                )
-            if os.path.exists(config):
-                with open(config, "r") as config_file:
-                    config = yaml.safe_load(config_file)
-            general_settings = config.get("general_settings", {})
-            database_url = general_settings.get("database_url", None)
-            if database_url and database_url.startswith("os.environ/"):
-                original_dir = os.getcwd()
-                # set the working directory to where this script is
-                sys.path.insert(
-                    0, os.path.abspath("../..")
-                )  # Adds the parent directory to the system path - for litellm local dev
-                import litellm
-                database_url = litellm.get_secret(database_url)
-                os.chdir(original_dir)
-            if database_url is not None and isinstance(database_url, str):
-                os.environ["DATABASE_URL"] = database_url
-        if os.getenv("DATABASE_URL", None) is not None:
-            try:
-                subprocess.run(["prisma"], capture_output=True)
-                is_prisma_runnable = True
-            except FileNotFoundError:
-                is_prisma_runnable = False
-            if is_prisma_runnable:
-                # run prisma db push, before starting server
-                # Save the current working directory
-                original_dir = os.getcwd()
-                # set the working directory to where this script is
-                abspath = os.path.abspath(__file__)
-                dname = os.path.dirname(abspath)
-                os.chdir(dname)
-                try:
-                    subprocess.run(
-                        ["prisma", "db", "push", "--accept-data-loss"]
-                    )  # this looks like a weird edge case when prisma just wont start on render. we need to have the --accept-data-loss
-                finally:
-                    os.chdir(original_dir)
-            else:
-                print(
-                    f"Unable to connect to DB. DATABASE_URL found in environment, but prisma package not found."
-                )
-        if port == 8000 and is_port_in_use(port):
-            port = random.randint(1024, 49152)
-        from litellm.proxy.proxy_server import app
-        uvicorn.run(app, host=host, port=port)  # run uvicorn
-        # if os.name == "nt":
-        # else:
-        #     import gunicorn.app.base
-        #     # Gunicorn Application Class
-        #     class StandaloneApplication(gunicorn.app.base.BaseApplication):
-        #         def __init__(self, app, options=None):
-        #             self.options = options or {}  # gunicorn options
-        #             self.application = app  # FastAPI app
-        #             super().__init__()
-        #             _endpoint_str = (
-        #                 f"curl --location 'http://0.0.0.0:{port}/chat/completions' \\"
-        #             )
-        #             curl_command = (
-        #                 _endpoint_str
-        #                 + """
-        #             --header 'Content-Type: application/json' \\
-        #             --data ' {
-        #             "model": "gpt-3.5-turbo",
-        #             "messages": [
-        #                 {
-        #                 "role": "user",
-        #                 "content": "what llm are you"
-        #                 }
-        #             ]
-        #             }'
-        #             \n
-        #             """
-        #             )
-        #             print()  # noqa
-        #             print(  # noqa
-        #                 f'\033[1;34mLiteLLM: Test your local proxy with: "litellm --test" This runs an openai.ChatCompletion request to your proxy [In a new terminal tab]\033[0m\n'
-        #             )
-        #             print(  # noqa
-        #                 f"\033[1;34mLiteLLM: Curl Command Test for your local proxy\n {curl_command} \033[0m\n"
-        #             )
-        #             print(
-        #                 "\033[1;34mDocs: https://docs.litellm.ai/docs/simple_proxy\033[0m\n"
-        #             )  # noqa
-        #             print(  # noqa
-        #                 f"\033[1;34mSee all Router/Swagger docs on http://0.0.0.0:{port} \033[0m\n"
-        #             )  # noqa
-        #         def load_config(self):
-        #             # note: This Loads the gunicorn config - has nothing to do with LiteLLM Proxy config
-        #             config = {
-        #                 key: value
-        #                 for key, value in self.options.items()
-        #                 if key in self.cfg.settings and value is not None
-        #             }
-        #             for key, value in config.items():
-        #                 self.cfg.set(key.lower(), value)
-        #         def load(self):
-        #             # gunicorn app function
-        #             return self.application
-        #     gunicorn_options = {
-        #         "bind": f"{host}:{port}",
-        #         "workers": num_workers,  # default is 1
-        #         "worker_class": "uvicorn.workers.UvicornWorker",
-        #         "preload": True,  # Add the preload flag,
-        #         "accesslog": "-",  # Log to stdout
-        #         "access_log_format": '%(h)s %(l)s %(u)s %(t)s "%(r)s" %(s)s %(b)s',
-        #     }
-        #     StandaloneApplication(
-        #         app=app, options=gunicorn_options
-        #     ).run()  # Run gunicorn
 if __name__ == "__main__":
-    run_server()

+from litellm.proxy.proxy_server import app, save_worker_config
+import uvicorn
 import random
+import subprocess, json
+import os
+host = "0.0.0.0"
+port = 8000
+api_base = None
+api_version = "2023-07-01-preview"
+model = None
+alias = None
+add_key = None
+headers = None
+save = False
+debug = False
+detailed_debug = False
+temperature = 0.0
+max_tokens = 1000
+request_timeout = 10
+drop_params = True
+add_function_to_prompt = True
+config = None
+max_budget = 100
+telemetry = False
+test = False
+local = False
+num_workers = 1
+test_async = False
+num_requests = 1
+use_queue = False
+health = False
+version = False
 def run_ollama_serve():
     try:
             f"""
             LiteLLM Warning: proxy started with `ollama` model\n`ollama serve` failed with Exception{e}. \nEnsure you run `ollama serve`
         """
+        )
 def is_port_in_use(port):
     import socket
     with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
         return s.connect_ex(("localhost", port)) == 0
+if model and "ollama" in model and api_base is None:
         run_ollama_serve()
+else:
+    if headers:
+        headers = json.loads(headers)
+    save_worker_config(
+        model=model,
+        alias=alias,
+        api_base=api_base,
+        api_version=api_version,
+        debug=debug,
+        detailed_debug=detailed_debug,
+        temperature=temperature,
+        max_tokens=max_tokens,
+        request_timeout=request_timeout,
+        max_budget=max_budget,
+        telemetry=telemetry,
+        drop_params=drop_params,
+        add_function_to_prompt=add_function_to_prompt,
+        headers=headers,
+        save=save,
+        config=config,
+        use_queue=use_queue,
+    )
+if port == 8000 and is_port_in_use(port):
+    port = random.randint(1024, 49152)
 if __name__ == "__main__":
+    uvicorn.run(app, host=host, port=port)