Spaces:

ubermenchh
/

arxiv-retrieval

Build error

File size: 3,599 Bytes

257b0ba
d8dbda0
257b0ba
 
 
 
 
f610abe
257b0ba
 
 
 
 
 
 
 
 
 
df1ddcb
 
 
 
aa905fa
df1ddcb
1b9d133
257b0ba
 
 
 
 
 
 
3ca82c1
257b0ba
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f610abe
 
 
 
6390543
257b0ba
f610abe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
257b0ba
f610abe
 
 
 
257b0ba
f610abe
257b0ba
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f610abe
257b0ba

from langchain.embeddings.huggingface import HuggingFaceEmbeddings
import os, pinecone, time, transformers
from datasets import load_dataset
from torch import bfloat16
from langchain.llms import HuggingFacePipeline
from langchain.vectorstores import Pinecone
from langchain.chains import RetrievalQA
import ctransformers

# Sentence-transformers checkpoint used to embed documents and queries.
embed_model_id = 'sentence-transformers/all-MiniLM-L6-v2'
# Device string handed to the embedding model below.
device = 'cpu'

# The same device is given to the model constructor and to each encode call;
# documents are encoded in batches of 32.
embed_model = HuggingFaceEmbeddings(
    model_name=embed_model_id,
    model_kwargs=dict(device=device),
    encode_kwargs=dict(device=device, batch_size=32),
)

# Embed two sample sentences. The resulting vectors are read later in this
# script to obtain the embedding dimension when the index is created.
docs = ['This is a document', 'and another document']
embeddings = embed_model.embed_documents(docs)

# Pinecone credentials come from the environment; each value is None when
# its variable is unset.
api_key = os.getenv('PINECONE_API_KEY')
env_name = os.getenv('PINECONE_ENV')

pinecone.init(api_key=api_key, environment=env_name)

index_name = 'llama-2-rag'

# Create the index only when no index with this name exists yet. Its
# dimension is taken from the sample embeddings computed above.
existing_indexes = pinecone.list_indexes()
if index_name not in existing_indexes:
    pinecone.create_index(
        index_name,
        metric='cosine',
        dimension=len(embeddings[0]),
    )
    # Poll once per second until the index reports itself ready.
    while True:
        if pinecone.describe_index(index_name).status['ready']:
            break
        time.sleep(1)

index = pinecone.Index(index_name)

# Load the pre-chunked arXiv dataset and convert it to a pandas DataFrame.
data = load_dataset('jamescalam/llama-2-arxiv-papers-chunked', split='train')
data = data.to_pandas()
batch_size = 32

# Embed the chunks and upsert them into the Pinecone index, batch_size rows
# at a time. Columns are iterated directly instead of calling
# DataFrame.iterrows() three times per batch, which built a Series per row
# on every pass and reused the outer loop variable name `i`.
for i in range(0, len(data), batch_size):
    i_end = min(len(data), i + batch_size)
    batch = data.iloc[i:i_end]
    # Vector id: "<doi>-<chunk-id>".
    ids = [
        f"{doi}-{chunk_id}"
        for doi, chunk_id in zip(batch['doi'], batch['chunk-id'])
    ]
    texts = batch['chunk'].tolist()
    embeds = embed_model.embed_documents(texts)
    # The chunk text is stored under the 'text' metadata key, alongside the
    # paper's source and title.
    metadata = [
        {'text': text, 'source': source, 'title': paper_title}
        for text, source, paper_title in zip(
            texts, batch['source'], batch['title']
        )
    ]
    # Pass a concrete list rather than a one-shot zip iterator.
    index.upsert(vectors=list(zip(ids, embeds, metadata)))

# Alternative GGML Llama-2 checkpoints; the single uncommented line selects
# the one that is loaded further down.
#model_id = "TheBloke/Llama-2-7B-GGML"
#model_id = "TheBloke/Llama-2-7B-chat-GGML"
#model_id = "TheBloke/Llama-2-13B-GGML"
model_id = "TheBloke/Llama-2-13B-chat-GGML"
# Hugging Face token from the environment (None when HF_AUTH_KEY is unset).
# In this file it is referenced only by the commented-out loading code.
hf_auth = os.getenv('HF_AUTH_KEY')

# bnb_config = transformers.BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type='nf4',
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_compute_dtype=bfloat16,
# )
# model_config = transformers.AutoConfig.from_pretrained(model_id, use_auth_token=hf_auth)
# model = transformers.AutoModelForCausalLM.from_pretrained(
#     model_id,
#     trust_remote_code=True,
#     config=model_config,
#     quantization_config=bnb_config,
#     device_map='auto',
#     use_auth_token=hf_auth
# )
# model.eval()

# tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, use_auth_token=hf_auth)

## Using GGML Llama

# Settings forwarded as keyword arguments when the GGML model is loaded.
config = dict(
    max_new_tokens=512,
    repetition_penalty=1.1,
    temperature=0.3,
    stream=True,
)
model = ctransformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    model_type='llama',
    # NOTE(review): hf=True presumably yields a transformers-compatible
    # wrapper, which the pipeline below relies on — confirm in ctransformers.
    hf=True,
    gpu_layers=130,  # 110 for 7b, 130 for 13b
    **config,
)
# The tokenizer is built from the loaded model object, not from model_id.
tokenizer = ctransformers.AutoTokenizer.from_pretrained(model)

# Keyword arguments for the text-generation pipeline; the sampling values
# repeat the ones used when the model was loaded.
_pipeline_kwargs = dict(
    task='text-generation',
    return_full_text=True,
    temperature=0.3,
    max_new_tokens=512,
    repetition_penalty=1.1,
)
generate_text = transformers.pipeline(
    model=model,
    tokenizer=tokenizer,
    **_pipeline_kwargs,
)
# Wrap the pipeline so LangChain can use it as an LLM.
llm = HuggingFacePipeline(pipeline=generate_text)
# Metadata key holding the chunk text; it matches the 'text' key written
# when the vectors were upserted.
text_field = 'text'
vectorstore = Pinecone(index, embed_model.embed_query, text_field)

# Retrieval-augmented QA chain of type 'stuff', backed by the vector store's
# default retriever.
_retriever = vectorstore.as_retriever()
rag_pipeline = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type='stuff',
    retriever=_retriever,
)

# `gr` was used below without ever being imported, so the script failed with
# a NameError before the UI could start.
import gradio as gr

title = 'arxiv-retrieval'


def predict(input):
    """Answer a question with the retrieval-augmented QA chain.

    Args:
        input: The user's question. The parameter name shadows the builtin
            but is kept so existing callers are unaffected.

    Returns:
        The value stored under the 'result' key of the chain's output.
    """
    return rag_pipeline(input)['result']


def _chat(message, history):
    """Adapt `predict` to the (text, state) -> (chatbot, state) interface.

    The Interface below declares two inputs and two outputs, whereas
    `predict` takes one argument and returns one string; wiring it in
    directly fails on the first request.

    Args:
        message: The text typed by the user.
        history: Conversation so far as a list of [user, bot] pairs, or
            None on the first call.

    Returns:
        A pair (history, history): the first element feeds the chatbot
        display, the second is kept as session state.
    """
    # Copy so the incoming state object is never mutated in place.
    history = list(history or [])
    history.append([message, predict(message)])
    return history, history


gr.Interface(
    fn=_chat,
    inputs=['text', 'state'],
    outputs=['chatbot', 'state'],
    title=title,
).launch()