File size: 2,870 Bytes
b247dc4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
"""Retrieve documentation for a given query."""

from pathlib import Path
from typing import Any
from rich.console import Console
from tqdm import tqdm
import numpy as np
from manifest import Manifest
from langchain.text_splitter import MarkdownHeaderTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter

console = Console(soft_wrap=True)

# Embedding client used by get_embeddings().  Pre-bind to None so that a
# failed construction (typically: OPENAI API key not set) leaves the module
# importable — without this, a construction failure leaves EMBEDDING_MODEL
# unbound and later calls crash with NameError instead of a clear message.
EMBEDDING_MODEL = None
try:
    EMBEDDING_MODEL = Manifest(
        client_name="openaiembedding",
    )
except Exception as e:
    console.print(e)
    console.print(
        "Failed to load embedding model. Likely OPENAI API key is not set. Please set to run document retrieval.",
        style="bold red",
    )


def load_documentation(path: Path) -> dict[str, str]:
    """Load all markdown documentation files found under ``path``.

    Args:
        path: Root directory searched recursively for ``*.md`` files.

    Returns:
        Mapping from each file's path with the ``path`` prefix stripped
        (so keys start with a path separator) to the file's text content.
        Empty dict when no markdown files are found.
    """
    content: dict[str, str] = {}
    for file in path.glob("**/*.md"):
        # Explicit UTF-8 avoids platform-dependent default encodings
        # (e.g. cp1252 on Windows) mangling the docs.
        data = file.read_text(encoding="utf-8")
        # Key by path relative to the root so keys are stable regardless
        # of where the docs directory lives on disk.
        key = str(file).replace(str(path), "")
        content[key] = data
    return content


def split_documents(content: dict[str, str]) -> list[dict[str, Any]]:
    """Split raw markdown documents into small overlapping chunks.

    Each document is first split on markdown headers (H1–H3), tagging every
    piece with its originating file under ``metadata["source"]``, then each
    piece is further split into ~500-character chunks with a 50-character
    overlap on newline boundaries.

    Args:
        content: Mapping from source file name to raw markdown text, as
            produced by ``load_documentation``.

    Returns:
        List of chunk dictionaries (one per chunk).  Each dict is the
        serialized langchain Document, carrying the chunk text under
        ``page_content`` and a ``metadata`` dict including ``source``.
        Note: the original annotation said ``dict`` but the value has
        always been a list.
    """
    md_splitted_docs = []
    markdown_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=[("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3")]
    )
    text_splitter = RecursiveCharacterTextSplitter(
        separators=["\n"], chunk_size=500, chunk_overlap=50, length_function=len
    )

    for file, raw_doc in content.items():
        splitted_text = markdown_splitter.split_text(raw_doc)
        # Tag each header-level piece with its source file so chunks remain
        # traceable after the second (character-level) split.
        for t in splitted_text:
            t.metadata["source"] = file
        md_splitted_docs.extend(splitted_text)

    docs = text_splitter.split_documents(md_splitted_docs)
    docs_as_dict = [doc.dict() for doc in docs]
    return docs_as_dict


def get_embeddings(text: str) -> np.ndarray:
    """Embed ``text`` using the module-level ``EMBEDDING_MODEL`` client.

    NOTE(review): if the client failed to initialize at import time (see the
    module-level try/except), this call will fail — callers must have a
    working OPENAI API key configured.

    Args:
        text: Text to embed.

    Returns:
        The client's embedding wrapped in ``np.array`` — presumably a 1-D
        float vector, but the shape depends on the Manifest client; verify
        against its documentation.
    """
    return np.array(EMBEDDING_MODEL.run(text))


def embed_documents(
    chunked_docs: list[dict[str, Any]], key: str = "page_content"
) -> tuple[list[dict[str, Any]], np.ndarray]:
    """Embed every chunk and stack the vectors into one matrix.

    Mutates each chunk dict in place, storing its vector under
    ``"embedding"``.  (Annotations corrected: the input is a *list* of chunk
    dicts — as returned by ``split_documents`` — not a dict.)

    Args:
        chunked_docs: Chunk dictionaries; each must hold text under ``key``.
        key: Dict key whose value is the text to embed.

    Returns:
        The (mutated) ``chunked_docs`` and a matrix whose i-th row is the
        embedding of the i-th chunk.  For empty input, an empty (0, 0)
        matrix is returned instead of letting ``np.vstack`` raise.
    """
    # Guard: np.vstack([]) raises ValueError on an empty sequence.
    if not chunked_docs:
        return chunked_docs, np.empty((0, 0))
    all_embeddings = []
    for doc in tqdm(chunked_docs):
        emb = get_embeddings(doc[key])
        doc["embedding"] = emb
        all_embeddings.append(emb)
    full_embedding_mat = np.vstack(all_embeddings)
    return chunked_docs, full_embedding_mat


def query_docs(
    query: str,
    docs: list[dict[str, Any]],
    embedding_mat: np.ndarray,
    top_n: int = 10,
    key: str = "page_content",
) -> tuple[list[int], list[str]]:
    """Return the ``top_n`` documents most similar to ``query``.

    Similarity is the dot product between the query embedding and each row
    of ``embedding_mat``.  (Annotation corrected: ``docs`` is a *list*
    aligned row-for-row with ``embedding_mat`` — it is indexed with integer
    positions from ``argsort`` — not a dict.)

    Args:
        query: Free-text query to embed and search with.
        docs: Chunk dictionaries; ``docs[i]`` corresponds to row ``i`` of
            ``embedding_mat``.
        embedding_mat: Matrix whose i-th row is the embedding of ``docs[i]``.
        top_n: Maximum number of results to return.
        key: Dict key holding each chunk's text.

    Returns:
        Indices of the top matches and their texts, both ordered by
        *ascending* score — the strongest match comes last (the original
        deliberately reverses the top slice; order preserved here).
    """
    query_embedding = get_embeddings(query)
    scores = embedding_mat.dot(query_embedding)
    # argsort ascending, reversed -> best-first over all rows.
    sorted_indices = np.argsort(scores)[::-1]
    # Keep the top_n, then reverse again so the best match is last.
    top_n_indices_rev = sorted_indices[:top_n][::-1]
    returned_docs = [docs[i][key] for i in top_n_indices_rev]
    return top_n_indices_rev.tolist(), returned_docs