alphrc committed on
Commit
b3061dc
1 Parent(s): 2058c5f

Delete src

Browse files
Files changed (2) hide show
  1. src/run-exo.py +0 -90
  2. src/run-hf.py +0 -4
src/run-exo.py DELETED
@@ -1,90 +0,0 @@
1
- # In this example, a user is running a home cluster with 3 shards.
2
- # They are prompting the cluster to generate a response to a question.
3
- # The cluster is given the question, and the user is given the response.
4
-
5
- from exo.inference.mlx.sharded_utils import get_model_path, load_tokenizer
6
- from exo.inference.shard import Shard
7
- from exo.networking.peer_handle import PeerHandle
8
- from exo.networking.grpc.grpc_peer_handle import GRPCPeerHandle
9
- from exo.topology.device_capabilities import DeviceCapabilities, DeviceFlops
10
- from typing import List
11
- import asyncio
12
- import argparse
13
- import uuid
14
-
15
- models = {
16
- "mlx-community/Meta-Llama-3-8B-Instruct-4bit": Shard(model_id="mlx-community/Meta-Llama-3-8B-Instruct-4bit", start_layer=0, end_layer=0, n_layers=32),
17
- "mlx-community/Meta-Llama-3-70B-Instruct-4bit": Shard(model_id="mlx-community/Meta-Llama-3-70B-Instruct-4bit", start_layer=0, end_layer=0, n_layers=80),
18
- ""
19
- }
20
-
21
- path_or_hf_repo = "mlx-community/Meta-Llama-3-8B-Instruct-4bit"
22
- model_path = get_model_path(path_or_hf_repo)
23
- tokenizer_config = {}
24
- tokenizer = load_tokenizer(model_path, tokenizer_config)
25
-
26
- # we intentionally leave out peer1 to demonstrate equality of nodes in exo.
27
- # there is no "master" node in exo, all nodes are equal and can take on any role.
28
- # peer1 = GRPCPeerHandle(
29
- # "node1",
30
- # "localhost:8080",
31
- # DeviceCapabilities(model="placeholder", chip="placeholder", memory=0)
32
- # )
33
- peer2 = GRPCPeerHandle(
34
- "node2",
35
- "localhost:8081",
36
- DeviceCapabilities(model="placeholder", chip="placeholder", memory=0, flops=DeviceFlops(fp32=0, fp16=0, int8=0))
37
- )
38
- shard = models[path_or_hf_repo]
39
- request_id = str(uuid.uuid4())
40
-
41
- async def run_prompt(prompt: str):
42
- if tokenizer.chat_template is None:
43
- tokenizer.chat_template = tokenizer.default_chat_template
44
- if (
45
- hasattr(tokenizer, "apply_chat_template")
46
- and tokenizer.chat_template is not None
47
- ):
48
- messages = [{"role": "user", "content": prompt}]
49
- prompt = tokenizer.apply_chat_template(
50
- messages, tokenize=False, add_generation_prompt=True
51
- )
52
-
53
- await peer2.connect()
54
- await peer2.global_reset(shard, set(), 2)
55
-
56
- try:
57
- await peer2.send_prompt(shard, prompt, request_id)
58
- except Exception as e:
59
- print(e)
60
-
61
- import time
62
- # poll 10 times per second for the result (generation is faster, but refreshing more often than this is not nice for the user)
63
- previous_length = 0
64
- n_tokens = 0
65
- start_time = time.perf_counter()
66
- while True:
67
- try:
68
- result, is_finished = await peer2.get_inference_result(request_id)
69
- except Exception as e:
70
- continue
71
- await asyncio.sleep(0.1)
72
-
73
- # Print the updated string in place
74
- updated_string = tokenizer.decode(result)
75
- n_tokens = len(result)
76
- print(updated_string[previous_length:], end='', flush=True)
77
- previous_length = len(updated_string)
78
-
79
- if is_finished:
80
- print("\nDone")
81
- break
82
- end_time = time.perf_counter()
83
- print(f"\nDone. Processed {n_tokens} tokens in {end_time - start_time:.2f} seconds ({n_tokens / (end_time - start_time):.2f} tokens/second)")
84
-
85
- if __name__ == "__main__":
86
- parser = argparse.ArgumentParser(description="Run prompt")
87
- parser.add_argument("--prompt", type=str, help="The prompt to run")
88
- args = parser.parse_args()
89
-
90
- asyncio.run(run_prompt(args.prompt))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/run-hf.py DELETED
@@ -1,4 +0,0 @@
1
- from transformers import AutoTokenizer, AutoModelForCausalLM
2
-
3
- tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-405B")
4
- model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3.1-405B")