Raj-Maharajwala
committed on
Commit
•
1adf2ab
1
Parent(s):
26a8e83
Update README.md
Browse files
README.md
CHANGED
@@ -118,23 +118,25 @@ install(show_locals=True)
|
|
118 |
|
119 |
@dataclass
|
120 |
class ModelConfig:
|
|
|
121 |
model_name: str = "Raj-Maharajwala/Open-Insurance-LLM-Llama3-8B-GGUF"
|
122 |
model_file: str = "open-insurance-llm-q4_k_m.gguf"
|
123 |
-
# model_file: str = "open-insurance-llm-q8_0.gguf"
|
124 |
-
# model_file: str = "open-insurance-llm-q5_k_m.gguf"
|
125 |
-
max_tokens: int = 1000
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
n_ctx: int = 2048
|
133 |
-
n_batch: int = 256
|
134 |
-
verbose: bool = False
|
135 |
-
use_mmap: bool = False
|
136 |
-
use_mlock: bool = True
|
137 |
-
offload_kqv: bool =True
|
|
|
138 |
|
139 |
class CustomFormatter(logging.Formatter):
|
140 |
"""Enhanced formatter with detailed context for different log levels"""
|
@@ -448,6 +450,7 @@ If you use base model or quantized model in your research or applications, pleas
|
|
448 |
title = {Open-Insurance-LLM-Llama3-8B-GGUF},
|
449 |
year = {2024},
|
450 |
publisher = {HuggingFace},
|
|
|
451 |
url = {https://huggingface.co/Raj-Maharajwala/Open-Insurance-LLM-Llama3-8B-GGUF}
|
452 |
}
|
453 |
```
|
|
|
118 |
|
119 |
@dataclass
|
120 |
class ModelConfig:
|
121 |
+
# Optimized parameters for coherent responses and efficient performance on devices like MacBook Air M2
|
122 |
model_name: str = "Raj-Maharajwala/Open-Insurance-LLM-Llama3-8B-GGUF"
|
123 |
model_file: str = "open-insurance-llm-q4_k_m.gguf"
|
124 |
+
# model_file: str = "open-insurance-llm-q8_0.gguf" # 8-bit quantization; higher precision, better quality, increased resource usage
|
125 |
+
# model_file: str = "open-insurance-llm-q5_k_m.gguf" # 5-bit quantization; balance between performance and resource efficiency
|
126 |
+
max_tokens: int = 1000 # Maximum number of tokens to generate in a single output
|
127 |
+
temperature: float = 0.1 # Controls randomness in output; lower values produce more coherent responses (performs scaling distribution)
|
128 |
+
top_k: int = 15 # After temperature scaling, consider only the top 15 most probable tokens during sampling
|
129 |
+
top_p: float = 0.2 # After reducing the set to 15 tokens, uses nucleus sampling to select tokens with a cumulative probability of 20%
|
130 |
+
repeat_penalty: float = 1.2 # Penalize repeated tokens to reduce redundancy
|
131 |
+
num_beams: int = 4 # Number of beams for beam search; higher values improve quality at the cost of speed
|
132 |
+
n_gpu_layers: int = -2 # Number of layers to offload to GPU; -1 for full GPU utilization, -2 for automatic configuration
|
133 |
+
n_ctx: int = 2048 # Context window size; Llama 3 models support up to 8192 tokens context length
|
134 |
+
n_batch: int = 256 # Number of tokens to process simultaneously; adjust based on available hardware (suggested 512)
|
135 |
+
verbose: bool = False # Set to True to enable verbose logging for debugging purposes
|
136 |
+
use_mmap: bool = False # Memory-map model to reduce RAM usage; set to True if running on limited memory systems
|
137 |
+
use_mlock: bool = True # Lock model into RAM to prevent swapping; improves performance on systems with sufficient RAM
|
138 |
+
offload_kqv: bool = True # Offload key, query, value matrices to GPU to accelerate inference
|
139 |
+
|
140 |
|
141 |
class CustomFormatter(logging.Formatter):
|
142 |
"""Enhanced formatter with detailed context for different log levels"""
|
|
|
450 |
title = {Open-Insurance-LLM-Llama3-8B-GGUF},
|
451 |
year = {2024},
|
452 |
publisher = {HuggingFace},
|
453 |
+
linkedin = {https://www.linkedin.com/in/raj6800/},
|
454 |
url = {https://huggingface.co/Raj-Maharajwala/Open-Insurance-LLM-Llama3-8B-GGUF}
|
455 |
}
|
456 |
```
|