Raj-Maharajwala committed on
Commit
1adf2ab
1 Parent(s): 26a8e83

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +18 -15
README.md CHANGED
@@ -118,23 +118,25 @@ install(show_locals=True)
118
 
119
  @dataclass
120
  class ModelConfig:
 
121
  model_name: str = "Raj-Maharajwala/Open-Insurance-LLM-Llama3-8B-GGUF"
122
  model_file: str = "open-insurance-llm-q4_k_m.gguf"
123
- # model_file: str = "open-insurance-llm-q8_0.gguf"
124
- # model_file: str = "open-insurance-llm-q5_k_m.gguf"
125
- max_tokens: int = 1000
126
- top_k: int = 15
127
- top_p: float = 0.2
128
- repeat_penalty: float = 1.2
129
- num_beams: int = 4
130
- n_gpu_layers: int = -2 #-2 # -1 for complete GPU usage
131
- temperature: float = 0.1 # Coherent(0.1) vs Creativity(0.8)
132
- n_ctx: int = 2048 # 2048 - 8192 -> As per Llama 3 Full Capacity
133
- n_batch: int = 256
134
- verbose: bool = False
135
- use_mmap: bool = False
136
- use_mlock: bool = True
137
- offload_kqv: bool =True
 
138
 
139
  class CustomFormatter(logging.Formatter):
140
  """Enhanced formatter with detailed context for different log levels"""
@@ -448,6 +450,7 @@ If you use base model or quantized model in your research or applications, pleas
448
  title = {Open-Insurance-LLM-Llama3-8B-GGUF},
449
  year = {2024},
450
  publisher = {HuggingFace},
 
451
  url = {https://huggingface.co/Raj-Maharajwala/Open-Insurance-LLM-Llama3-8B-GGUF}
452
  }
453
  ```
 
118
 
119
  @dataclass
120
  class ModelConfig:
121
+ # Optimized parameters for coherent responses and efficient performance on devices like MacBook Air M2
122
  model_name: str = "Raj-Maharajwala/Open-Insurance-LLM-Llama3-8B-GGUF"
123
  model_file: str = "open-insurance-llm-q4_k_m.gguf"
124
+ # model_file: str = "open-insurance-llm-q8_0.gguf" # 8-bit quantization; higher precision, better quality, increased resource usage
125
+ # model_file: str = "open-insurance-llm-q5_k_m.gguf" # 5-bit quantization; balance between performance and resource efficiency
126
+ max_tokens: int = 1000 # Maximum number of tokens to generate in a single output
127
+ temperature: float = 0.1 # Controls randomness in output; lower values produce more coherent responses (performs scaling distribution)
128
+ top_k: int = 15 # After temperature scaling, consider the top 15 most probable tokens during sampling
129
+ top_p: float = 0.2 # After reducing the set to 15 tokens, uses nucleus sampling to select tokens with a cumulative probability of 20%
130
+ repeat_penalty: float = 1.2 # Penalize repeated tokens to reduce redundancy
131
+ num_beams: int = 4 # Number of beams for beam search; higher values improve quality at the cost of speed
132
+ n_gpu_layers: int = -2 # Number of layers to offload to GPU; -1 for full GPU utilization, -2 for automatic configuration
133
+ n_ctx: int = 2048 # Context window size; Llama 3 models support up to 8192 tokens context length
134
+ n_batch: int = 256 # Number of tokens to process simultaneously; adjust based on available hardware (suggested 512)
135
+ verbose: bool = False # True for enabling verbose logging for debugging purposes
136
+ use_mmap: bool = False # Memory-map model to reduce RAM usage; set to True if running on limited memory systems
137
+ use_mlock: bool = True # Lock model into RAM to prevent swapping; improves performance on systems with sufficient RAM
138
+ offload_kqv: bool = True # Offload key, query, value matrices to GPU to accelerate inference
139
+
140
 
141
  class CustomFormatter(logging.Formatter):
142
  """Enhanced formatter with detailed context for different log levels"""
 
450
  title = {Open-Insurance-LLM-Llama3-8B-GGUF},
451
  year = {2024},
452
  publisher = {HuggingFace},
453
+ linkedin = {https://www.linkedin.com/in/raj6800/},
454
  url = {https://huggingface.co/Raj-Maharajwala/Open-Insurance-LLM-Llama3-8B-GGUF}
455
  }
456
  ```