# SET UP

In [None]:
!pip install langchain langchain-core==0.2.38 langchain-community==0.2.16 langchain-text-splitters langchain_huggingface==0.0.3 langchain-openai langchain-qdrant qdrant-client tiktoken pymupdf ragas==0.1.14 sentence_transformers datasets pyarrow

In [None]:
import os
import getpass
import pandas as pd
from uuid import uuid4
import re
from tqdm import tqdm
from operator import itemgetter
import json

from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import ChatPromptTemplate
from langchain.schema import StrOutputParser

from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.vectorstores import Qdrant

from langchain_huggingface import HuggingFaceEmbeddings

from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context

import datasets

from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    answer_correctness,
    context_recall,
    context_precision,
)

from sentence_transformers import SentenceTransformer
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from sentence_transformers import InputExample

from sentence_transformers.losses import MatryoshkaLoss, MultipleNegativesRankingLoss
from sentence_transformers.evaluation import InformationRetrievalEvaluator

In [None]:
os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_API_KEY"] = getpass.getpass("LangChain API Key:")
os.environ["LANGCHAIN_PROJECT"] = f"AIM - SDG Midterm - {uuid4().hex[0:8]}"

OpenAI API Key:··········
LangChain API Key:··········


In [None]:
# DATA LOADER
DATA_LINK1 = "https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf"
DATA_LINK2 = "https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf"

# CHUNKING CONFIGS
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50
SGD_CHUNK_SIZE = 800
SGD_CHUNK_OVERLAP = 100
FINETUNE_CHUNK_SIZE = 1000
FINETUNE_CHUNK_OVERLAP = 75

# RETRIEVER CONFIGS
COLLECTION_NAME = "AI Bill of Rights"

EMBEDDING_MODEL = "text-embedding-3-small"

# SGD CONFIGS
GENERATOR_MODEL = "gpt-3.5-turbo"
CRITIC_MODEL = "gpt-4o-mini"

# FINAL RAG CONFIGS
QA_MODEL = "gpt-4o"

RAG_PROMPT = """\
Given a provided context and question, you must answer the question based only on context.

If you cannot answer the question based on the context - you must say "I don't know".

Context: {context}
Question: {question}
"""

# EVAL CONFIGS
metrics = [
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
    answer_correctness,
]

# FINETUNING CONFIGS
BATCH_SIZE = 20
EPOCHS = 5

# HELPER FUNCTIONS

In [None]:
def chunk_documents(unchunked_documents, chunk_size, chunk_overlap):
  text_splitter = RecursiveCharacterTextSplitter(
      chunk_size=chunk_size,
      chunk_overlap=chunk_overlap,
  )

  return text_splitter.split_documents(unchunked_documents)


def build_retriever(chunked_documents, embeddings, collection_name):
  vectorstore = Qdrant.from_documents(
      documents=chunked_documents,
      embedding=embeddings,
      location=":memory:",
      collection_name=collection_name,
  )

  retriever = vectorstore.as_retriever()

  return retriever


def generate_sgd(generator_llm, critic_llm, embeddings, chunked_documents, sgd_ct=20, with_debugging_logs=False):
  generator = TestsetGenerator.from_langchain(
      generator_llm,
      critic_llm,
      embeddings,
  )

  distributions = {
      simple: 0.5,
      multi_context: 0.4,
      reasoning: 0.1
  }

  testset = generator.generate_with_langchain_docs(chunked_documents, sgd_ct, distributions, with_debugging_logs=with_debugging_logs)

  return testset.to_pandas()


def generate_response_dataset(sgd_df, chain):
  answers = []
  contexts = []
  questions = sgd_df["question"].values.tolist()
  ground_truths = sgd_df["ground_truth"].values.tolist()

  for question in tqdm(questions):
    answer = chain.invoke({"question" : question})
    answers.append(answer["response"].content)
    contexts.append([context.page_content for context in answer["context"]])

  return datasets.Dataset.from_dict({
      "question" : questions,
      "answer" : answers,
      "contexts" : contexts,
      "ground_truth" : ground_truths
  })


# LOAD DOCUMENTS

In [None]:
rag_documents_1 = PyMuPDFLoader(file_path=DATA_LINK1).load()
rag_documents_2 = PyMuPDFLoader(file_path=DATA_LINK2).load()

# BUILD RAG

In [None]:
chunked_rag_documents = chunk_documents(rag_documents_1, CHUNK_SIZE, CHUNK_OVERLAP) + chunk_documents(rag_documents_2, CHUNK_SIZE, CHUNK_OVERLAP)
embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL)
retriever = build_retriever(chunked_rag_documents, embeddings, COLLECTION_NAME)

rag_prompt = ChatPromptTemplate.from_template(RAG_PROMPT)
qa_llm = ChatOpenAI(model=QA_MODEL)

rag_chain = (
    {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": rag_prompt | qa_llm, "context": itemgetter("context")}
)
rag_chain.invoke({"question" : "What are the risks of AI and how can we manage it?"})

# rag_chain = (
#     {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
#     | rag_prompt | llm | StrOutputParser()
# )

# SGD

In [None]:
generator_llm = ChatOpenAI(model=GENERATOR_MODEL)
critic_llm = ChatOpenAI(model=CRITIC_MODEL)
sgd_chunked_documents = chunk_documents(rag_documents_1, SGD_CHUNK_SIZE, SGD_CHUNK_OVERLAP) + chunk_documents(rag_documents_2, SGD_CHUNK_SIZE, SGD_CHUNK_OVERLAP)

In [None]:
sgd_df = generate_sgd(generator_llm, critic_llm, embeddings, sgd_chunked_documents, 20)

In [None]:
response_dataset = generate_response_dataset(sgd_df, rag_chain)

100%|██████████| 20/20 [00:30<00:00,  1.51s/it]


In [None]:
results = evaluate(response_dataset, metrics)
pd.DataFrame(list(results.items()), columns=['Metric', 'Value'])

Evaluating:   0%|          | 0/100 [00:00<?, ?it/s]

post: trace=9d73448f-43b9-408d-864d-245cce13885b,id=9d73448f-43b9-408d-864d-245cce13885b; patch: trace=769aba0a-a2b7-4958-a831-42914777b5d7,id=769aba0a-a2b7-4958-a831-42914777b5d7


Unnamed: 0,Metric,Value
0,faithfulness,0.735909
1,answer_relevancy,0.774979
2,context_recall,0.783333
3,context_precision,0.780556
4,answer_correctness,0.660136


# FINETUNE

In [None]:
training_documents = chunk_documents(rag_documents_1, FINETUNE_CHUNK_SIZE, FINETUNE_CHUNK_OVERLAP) + chunk_documents(rag_documents_2, FINETUNE_CHUNK_SIZE, FINETUNE_CHUNK_OVERLAP)

In [None]:
id_set = set()
for document in training_documents:
  id = str(uuid4())
  while id in id_set:
    id = uuid4()
  id_set.add(id)
  document.metadata["id"] = id

In [None]:
len(training_documents)

466

In [None]:
training_split_documents = training_documents[:300]
val_split_documents = training_documents[300:400]
test_split_documents = training_documents[400:]

In [None]:
qa_chat_model = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0
)

qa_prompt = """\
Given the following context, you must generate questions based on only the provided context.

You are to generate {n_questions} questions which should be provided in the following format:

1. QUESTION #1
2. QUESTION #2
...

Context:
{context}
"""

qa_prompt_template = ChatPromptTemplate.from_template(qa_prompt)

question_generation_chain = qa_prompt_template | qa_chat_model

In [None]:
def create_questions(documents, n_questions):
  questions = {}
  relevant_docs = {}
  id_set = set()
  for document in tqdm(documents, desc="Processing documents"):
    response = question_generation_chain.invoke({"context": document, "n_questions": n_questions}).content
    questions_list = re.findall(r'\d+\.\s+(.*)', response.strip())
    for question in questions_list:
      id = str(uuid4())
      while id in id_set:
        id = uuid4()
      id_set.add(id)
      questions[id] = question
      relevant_docs[id] = [document.metadata["id"]]

  return questions, relevant_docs


def create_dataset(documents, questions, contexts):
  corpus = {item.metadata["id"] : item.page_content for item in documents}

  dataset = {
      "questions" : questions,
      "relevant_contexts" : contexts,
      "corpus" : corpus
  }

  return dataset

In [None]:
training_questions, training_relevant_contexts = create_questions(training_split_documents, 2)
val_questions, val_relevant_contexts = create_questions(val_split_documents, 2)
test_questions, test_relevant_contexts = create_questions(test_split_documents, 2)

In [None]:
train_dataset = create_dataset(training_split_documents, training_questions, training_relevant_contexts)
val_dataset = create_dataset(val_split_documents, val_questions, val_relevant_contexts)
test_dataset = create_dataset(test_split_documents, test_questions, test_relevant_contexts)

In [None]:
corpus = train_dataset['corpus']
queries = train_dataset['questions']
relevant_docs = train_dataset['relevant_contexts']

examples = []
for query_id, query in queries.items():
    doc_id = relevant_docs[query_id][0]
    text = corpus[doc_id]
    example = InputExample(texts=[query, text])
    examples.append(example)

loader = DataLoader(
    examples, batch_size=BATCH_SIZE
)

In [None]:
model_id = "Snowflake/snowflake-arctic-embed-m"
model = SentenceTransformer(model_id)

matryoshka_dimensions = [768, 512, 256, 128, 64]
inner_train_loss = MultipleNegativesRankingLoss(model)
train_loss = MatryoshkaLoss(
    model, inner_train_loss, matryoshka_dims=matryoshka_dimensions
)

evaluator = InformationRetrievalEvaluator(val_dataset['questions'], val_dataset['corpus'], val_dataset['relevant_contexts'])

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/84.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/107 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/738 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [None]:
warmup_steps = int(len(loader) * EPOCHS * 0.1)

model.fit(
    train_objectives=[(loader, train_loss)],
    epochs=EPOCHS,
    warmup_steps=warmup_steps,
    output_path='finetuned_arctic',
    show_progress_bar=True,
    evaluator=evaluator,
    evaluation_steps=50,
)

Step,Training Loss,Validation Loss,Cosine Accuracy@1,Cosine Accuracy@3,Cosine Accuracy@5,Cosine Accuracy@10,Cosine Precision@1,Cosine Precision@3,Cosine Precision@5,Cosine Precision@10,Cosine Recall@1,Cosine Recall@3,Cosine Recall@5,Cosine Recall@10,Cosine Ndcg@10,Cosine Mrr@10,Cosine Map@100,Dot Accuracy@1,Dot Accuracy@3,Dot Accuracy@5,Dot Accuracy@10,Dot Precision@1,Dot Precision@3,Dot Precision@5,Dot Precision@10,Dot Recall@1,Dot Recall@3,Dot Recall@5,Dot Recall@10,Dot Ndcg@10,Dot Mrr@10,Dot Map@100
30,No log,No log,0.71,0.86,0.915,0.94,0.71,0.286667,0.183,0.094,0.71,0.86,0.915,0.94,0.828588,0.792325,0.795273,0.71,0.86,0.915,0.94,0.71,0.286667,0.183,0.094,0.71,0.86,0.915,0.94,0.828588,0.792325,0.795273
50,No log,No log,0.76,0.89,0.93,0.97,0.76,0.296667,0.186,0.097,0.76,0.89,0.93,0.97,0.865289,0.831831,0.832582,0.76,0.89,0.93,0.97,0.76,0.296667,0.186,0.097,0.76,0.89,0.93,0.97,0.865289,0.831831,0.832582
60,No log,No log,0.745,0.895,0.93,0.965,0.745,0.298333,0.186,0.0965,0.745,0.895,0.93,0.965,0.860456,0.826478,0.827652,0.745,0.895,0.93,0.965,0.745,0.298333,0.186,0.0965,0.745,0.895,0.93,0.965,0.860456,0.826478,0.827652
90,No log,No log,0.74,0.905,0.94,0.965,0.74,0.301667,0.188,0.0965,0.74,0.905,0.94,0.965,0.858575,0.823792,0.82501,0.74,0.905,0.94,0.965,0.74,0.301667,0.188,0.0965,0.74,0.905,0.94,0.965,0.858575,0.823792,0.82501
100,No log,No log,0.75,0.9,0.935,0.965,0.75,0.3,0.187,0.0965,0.75,0.9,0.935,0.965,0.860952,0.827181,0.828435,0.75,0.9,0.935,0.965,0.75,0.3,0.187,0.0965,0.75,0.9,0.935,0.965,0.860952,0.827181,0.828435
120,No log,No log,0.735,0.9,0.935,0.96,0.735,0.3,0.187,0.096,0.735,0.9,0.935,0.96,0.853306,0.818359,0.820011,0.735,0.9,0.935,0.96,0.735,0.3,0.187,0.096,0.735,0.9,0.935,0.96,0.853306,0.818359,0.820011
150,No log,No log,0.73,0.9,0.935,0.96,0.73,0.3,0.187,0.096,0.73,0.9,0.935,0.96,0.851169,0.81554,0.817223,0.73,0.9,0.935,0.96,0.73,0.3,0.187,0.096,0.73,0.9,0.935,0.96,0.851169,0.81554,0.817223


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

In [None]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
model.push_to_hub("snowflake-arctic-embed-m-finetuned")

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

'https://huggingface.co/ldldld/snowflake-arctic-embed-m-finetuned/commit/086cd7cd3018bedacef15abc6a38056ca7104587'

## Evaluate and Compare Finetuned vs. Off-the-shelves

In [None]:
finetune_embeddings = HuggingFaceEmbeddings(model_name="finetuned_arctic")

Some weights of BertModel were not initialized from the model checkpoint at finetuned_arctic and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
finetune_retriever = build_retriever(chunked_rag_documents, finetune_embeddings, COLLECTION_NAME+"_FT")
finetune_rag_chain = (
    {"context": itemgetter("question") | finetune_retriever, "question": itemgetter("question")}
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": rag_prompt | qa_llm, "context": itemgetter("context")}
)
finetune_rag_chain.invoke({"question" : "What are the risks of AI and how can we manage it?"})

In [None]:
finetune_sgd_df = generate_sgd(generator_llm, critic_llm, finetune_embeddings, sgd_chunked_documents, 20)

In [None]:
finetune_response_dataset = generate_response_dataset(finetune_sgd_df, finetune_rag_chain)

100%|██████████| 18/18 [00:34<00:00,  1.90s/it]


In [None]:
finetune_results = evaluate(finetune_response_dataset, metrics)

In [None]:
pd.DataFrame(list(results.items()), columns=['Metric', EMBEDDING_MODEL]).merge(
    pd.DataFrame(list(finetune_results.items()), columns=['Metric', "finetuned_arctic"])
)

Unnamed: 0,Metric,text-embedding-3-small,finetuned_arctic
0,faithfulness,0.735909,0.707843
1,answer_relevancy,0.774979,0.752002
2,context_recall,0.783333,0.777778
3,context_precision,0.780556,0.736111
4,answer_correctness,0.660136,0.505122
