File size: 3,599 Bytes
257b0ba
d8dbda0
257b0ba
 
 
 
 
f610abe
257b0ba
 
 
 
 
 
 
 
 
 
df1ddcb
 
 
 
aa905fa
df1ddcb
1b9d133
257b0ba
 
 
 
 
 
 
3ca82c1
257b0ba
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f610abe
 
 
 
6390543
257b0ba
f610abe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
257b0ba
f610abe
 
 
 
257b0ba
f610abe
257b0ba
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f610abe
257b0ba
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import os
import time

import ctransformers
import gradio as gr
import pinecone
import transformers
from datasets import load_dataset
from langchain.chains import RetrievalQA
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.vectorstores import Pinecone
from torch import bfloat16

# Sentence-embedding model; the same device string is handed both to the
# model constructor and to every encode call.
embed_model_id = 'sentence-transformers/all-MiniLM-L6-v2'
device = 'cpu'

embed_model = HuggingFaceEmbeddings(
    model_name=embed_model_id,
    model_kwargs=dict(device=device),
    encode_kwargs=dict(device=device, batch_size=32),
)

# Embed two throwaway sentences; the length of the resulting vectors is what
# the index-creation step below uses as the index dimension.
docs = ['This is a document', 'and another document']
embeddings = embed_model.embed_documents(docs)

# Pinecone credentials are read from the process environment.
api_key = os.environ.get('PINECONE_API_KEY')
env_name = os.environ.get('PINECONE_ENV')
pinecone.init(api_key=api_key, environment=env_name)

index_name = 'llama-2-rag'

# Only when the index is missing: create it with the embedding dimension,
# then poll once per second until its status reports ready.
if index_name not in pinecone.list_indexes():
    pinecone.create_index(
        index_name,
        dimension=len(embeddings[0]),
        metric='cosine',
    )
    while True:
        if pinecone.describe_index(index_name).status['ready']:
            break
        time.sleep(1)

# Handle used for the upserts and by the vector store further down.
index = pinecone.Index(index_name)

# Load the pre-chunked dataset and work on it as a pandas DataFrame.
data = load_dataset('jamescalam/llama-2-arxiv-papers-chunked', split='train')
data = data.to_pandas()
batch_size = 32

# Embed and upsert the chunks in batches of `batch_size` rows.
for i in range(0, len(data), batch_size):
    i_end = min(len(data), i + batch_size)
    batch = data.iloc[i:i_end]
    # Walk the batch once and reuse the rows for ids, texts and metadata,
    # instead of calling iterrows() three times per batch.
    rows = [row for _, row in batch.iterrows()]
    # Vector id is "<doi>-<chunk-id>", so re-running overwrites the same ids.
    ids = [f"{row['doi']}-{row['chunk-id']}" for row in rows]
    texts = [row['chunk'] for row in rows]
    embeds = embed_model.embed_documents(texts)
    metadata = [
        {
            'text': row['chunk'],
            'source': row['source'],
            'title': row['title'],
        }
        for row in rows
    ]
    # Materialise the (id, vector, metadata) triples: a bare zip() is a
    # one-shot iterator and would be empty if the client iterated it twice.
    index.upsert(vectors=list(zip(ids, embeds, metadata)))

# GGML Llama-2 repository to load. Alternatives left here for switching:
#   "TheBloke/Llama-2-7B-GGML"
#   "TheBloke/Llama-2-7B-chat-GGML"
#   "TheBloke/Llama-2-13B-GGML"
model_id = "TheBloke/Llama-2-13B-chat-GGML"
# Hugging Face token from the environment; None when the variable is unset.
hf_auth = os.getenv('HF_AUTH_KEY')

# bnb_config = transformers.BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type='nf4',
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_compute_dtype=bfloat16,
# )
# model_config = transformers.AutoConfig.from_pretrained(model_id, use_auth_token=hf_auth)
# model = transformers.AutoModelForCausalLM.from_pretrained(
#     model_id,
#     trust_remote_code=True,
#     config=model_config,
#     quantization_config=bnb_config,
#     device_map='auto',
#     use_auth_token=hf_auth
# )
# model.eval()

# tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, use_auth_token=hf_auth)

## Using GGML Llama

# Generation settings, forwarded to ctransformers as keyword arguments.
config = dict(
    max_new_tokens=512,
    repetition_penalty=1.1,
    temperature=0.3,
    stream=True,
)

# Load the GGML weights through ctransformers; hf=True requests a
# Hugging Face Transformers-style model object.
model = ctransformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    model_type='llama',
    hf=True,
    gpu_layers=130,  # 110 for 7b, 130 for 13b
    **config,
)

# The tokenizer is derived from the loaded model, not from the repo id.
tokenizer = ctransformers.AutoTokenizer.from_pretrained(model)

# Text-generation pipeline around the loaded model and tokenizer.
generate_text = transformers.pipeline(
    task='text-generation',
    model=model,
    tokenizer=tokenizer,
    return_full_text=True,
    max_new_tokens=512,
    temperature=0.3,
    repetition_penalty=1.1,
)

# Expose the pipeline to LangChain as an LLM.
llm = HuggingFacePipeline(pipeline=generate_text)

# 'text' is the metadata key under which each chunk's text is upserted.
text_field = 'text'
vectorstore = Pinecone(index, embed_model.embed_query, text_field)

# Retrieval QA chain of type 'stuff' over the Pinecone retriever.
rag_pipeline = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type='stuff',
    retriever=vectorstore.as_retriever(),
)

# Application title string.
title = 'arxiv-retrieval'

def predict(input):
    """Run a query through the retrieval QA chain and return its answer.

    Args:
        input: Query passed unchanged to ``rag_pipeline``.

    Returns:
        The value stored under the ``'result'`` key of the chain's output.
    """
    response = rag_pipeline(input)
    return response['result']

def _chat(message, history):
    """Adapt ``predict`` to the two-input / two-output interface below.

    Args:
        message: Text entered by the user.
        history: List of ``(question, answer)`` pairs held in the session
            state, or ``None`` on the first call.

    Returns:
        The updated history twice: once for the chatbot display and once to
        be stored back into the session state.
    """
    # Copy so the previous state object is never mutated in place.
    history = list(history or [])
    history.append((message, predict(message)))
    return history, history


# The interface declares two inputs (text, state) and two outputs
# (chatbot, state), so its callback must accept two arguments and return two
# values. `predict` takes one argument and returns a single string, hence
# the adapter above.
gr.Interface(
    fn=_chat,
    inputs=['text', 'state'],
    outputs=['chatbot', 'state'],
    title=title,
).launch()