NickNYU commited on
Commit
b5d3f34
·
1 Parent(s): 0ff6c12

upload github well compiled files (#1)

Browse files

- upload github well compiled files (a26db82602b82ba45fe7e51dd6073c0961edaebd)

.gitignore ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .DS_Store
2
+ # Byte-compiled / optimized / DLL files
3
+ __pycache__/
4
+ *.py[cod]
5
+ *$py.class
6
+
7
+ # C extensions
8
+ *.so
9
+
10
+ # Distribution / packaging
11
+ .Python
12
+ bin/
13
+ build/
14
+ develop-eggs/
15
+ dist/
16
+ downloads/
17
+ eggs/
18
+ .eggs/
19
+ etc/
20
+ include/
21
+ lib/
22
+ lib64/
23
+ parts/
24
+ sdist/
25
+ share/
26
+ var/
27
+ wheels/
28
+ pip-wheel-metadata/
29
+ share/python-wheels/
30
+ *.egg-info/
31
+ .installed.cfg
32
+ *.egg
33
+ MANIFEST
34
+
35
+ # PyInstaller
36
+ # Usually these files are written by a python script from a template
37
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
38
+ *.manifest
39
+ *.spec
40
+
41
+ # Installer logs
42
+ pip-log.txt
43
+ pip-delete-this-directory.txt
44
+
45
+ # Unit test / coverage reports
46
+ htmlcov/
47
+ .tox/
48
+ .nox/
49
+ .coverage
50
+ .coverage.*
51
+ .cache
52
+ nosetests.xml
53
+ coverage.xml
54
+ *.cover
55
+ *.py,cover
56
+ .hypothesis/
57
+ .pytest_cache/
58
+ .ruff_cache
59
+
60
+ # Translations
61
+ *.mo
62
+ *.pot
63
+
64
+ # Django stuff:
65
+ *.log
66
+ local_settings.py
67
+ db.sqlite3
68
+ db.sqlite3-journal
69
+
70
+ # Flask stuff:
71
+ instance/
72
+ .webassets-cache
73
+
74
+ # Scrapy stuff:
75
+ .scrapy
76
+
77
+ # Sphinx documentation
78
+ docs/_build/
79
+
80
+ # PyBuilder
81
+ target/
82
+
83
+ # Jupyter Notebook
84
+ .ipynb_checkpoints
85
+ notebooks/
86
+
87
+ # IPython
88
+ profile_default/
89
+ ipython_config.py
90
+
91
+ # pyenv
92
+ .python-version
93
+
94
+ # pipenv
95
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
96
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
97
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
98
+ # install all needed dependencies.
99
+ #Pipfile.lock
100
+
101
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
102
+ __pypackages__/
103
+
104
+ # Celery stuff
105
+ celerybeat-schedule
106
+ celerybeat.pid
107
+
108
+ # SageMath parsed files
109
+ *.sage.py
110
+
111
+ # Environments
112
+ .env
113
+ .venv
114
+ env/
115
+ venv/
116
+ ENV/
117
+ env.bak/
118
+ venv.bak/
119
+ pyvenv.cfg
120
+
121
+ # Spyder project settings
122
+ .spyderproject
123
+ .spyproject
124
+
125
+ # Rope project settings
126
+ .ropeproject
127
+
128
+ # mkdocs documentation
129
+ /site
130
+
131
+ # mypy
132
+ .mypy_cache/
133
+ .dmypy.json
134
+ dmypy.json
135
+
136
+ # Pyre type checker
137
+ .pyre/
138
+
139
+ # Jetbrains
140
+ .idea
141
+ modules/
142
+ *.swp
143
+
144
+ # pipenv
145
+ Pipfile
146
+ Pipfile.lock
147
+
148
+ # pyright
149
+ pyrightconfig.json
.pre-commit-config.yaml ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ repos:
2
+ - repo: https://github.com/charliermarsh/ruff-pre-commit
3
+ rev: v0.0.243
4
+ hooks:
5
+ - id: ruff
Makefile ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .PHONY: format lint
2
+
3
+ GIT_ROOT ?= $(shell git rev-parse --show-toplevel)
4
+
5
+ format:
6
+ black .
7
+
8
+ lint:
9
+ mypy .
10
+ black . --check
11
+ ruff check .
12
+
13
+ test:
14
+ pytest tests
app.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from llama_index import SimpleDirectoryReader
3
+ from llama_index.node_parser import SimpleNodeParser
4
+ from llama_index.data_structs.node import Node, DocumentRelationship
5
+ from llama_index import VectorStoreIndex
6
+ from llama_index import LLMPredictor, VectorStoreIndex, ServiceContext
7
+ from langchain.llms import AzureOpenAI
8
+ from langchain.embeddings.openai import OpenAIEmbeddings
9
+ from llama_index import LangchainEmbedding, ServiceContext
10
+ from llama_index import StorageContext, load_index_from_storage
11
+
12
+ import logging
13
+ import sys
14
+
15
+
16
+ logging.basicConfig(
17
+ stream=sys.stdout, level=logging.DEBUG
18
+ ) # logging.DEBUG for more verbose output
19
+ logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))
20
+
21
+
22
+ def main() -> None:
23
+ documents = SimpleDirectoryReader("./data").load_data()
24
+
25
+ # index = VectorStoreIndex.from_documents(documents)
26
+
27
+ # parser = SimpleNodeParser()
28
+ # nodes = parser.get_nodes_from_documents(documents)
29
+ # index = VectorStoreIndex(nodes)
30
+
31
+ # define embedding
32
+ embedding = LangchainEmbedding(OpenAIEmbeddings(client=None, chunk_size=1))
33
+ # define LLM
34
+ llm_predictor = LLMPredictor(
35
+ llm=AzureOpenAI(
36
+ client=None,
37
+ deployment_name="text-davinci-003",
38
+ model="text-davinci-003",
39
+ )
40
+ )
41
+
42
+ # configure service context
43
+ service_context = ServiceContext.from_defaults(
44
+ llm_predictor=llm_predictor, embed_model=embedding
45
+ )
46
+
47
+ # build index
48
+ index = VectorStoreIndex.from_documents(
49
+ documents,
50
+ service_context=service_context,
51
+ )
52
+
53
+ index.storage_context.persist(persist_dir="./dataset")
54
+ storage_context = StorageContext.from_defaults(persist_dir="./dataset")
55
+ index = load_index_from_storage(
56
+ storage_context=storage_context, service_context=service_context
57
+ )
58
+
59
+ # index.vector_store.persist("./dataset")
60
+ # query with embed_model specified
61
+ query_engine = index.as_query_engine(
62
+ retriever_mode="embedding", verbose=True, service_context=service_context
63
+ )
64
+ response = query_engine.query("请帮忙推荐一杯咖啡给我,我喜欢咖啡因")
65
+ print(response)
66
+
67
+
68
+ if __name__ == "__main__":
69
+ main()
core/__init__.py ADDED
File without changes
core/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (147 Bytes). View file
 
core/__pycache__/lifecycle.cpython-310.pyc ADDED
Binary file (6.82 kB). View file
 
core/__pycache__/logger_factory.cpython-310.pyc ADDED
Binary file (778 Bytes). View file
 
core/__pycache__/test_lifecycle.cpython-310.pyc ADDED
Binary file (2.54 kB). View file
 
core/lifecycle.py ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import enum
2
+ from abc import ABC, abstractmethod
3
+ from typing import TypeVar, Optional
4
+
5
+ from core import logger_factory
6
+
7
+
8
+ class Initializable(ABC):
9
+ @abstractmethod
10
+ def initialize(self) -> None:
11
+ pass
12
+
13
+
14
+ class Startable(ABC):
15
+ @abstractmethod
16
+ def start(self) -> None:
17
+ pass
18
+
19
+
20
+ class Stoppable(ABC):
21
+ @abstractmethod
22
+ def stop(self) -> None:
23
+ pass
24
+
25
+
26
+ class Disposable(ABC):
27
+ @abstractmethod
28
+ def dispose(self) -> None:
29
+ pass
30
+
31
+
32
+ class LifecycleAware(ABC):
33
+ def __init__(self, state: "LifecycleState") -> None:
34
+ """
35
+ Args:
36
+ state(LifecycleState): lifecycle state
37
+ """
38
+ self.state = state
39
+
40
+ @property
41
+ def get_lifecycle_state(self) -> "LifecycleState":
42
+ return self.state
43
+
44
+
45
+ class Lifecycle(Initializable, Startable, Stoppable, Disposable, LifecycleAware, ABC):
46
+ def __init__(self) -> None:
47
+ self.logger = logger_factory.get_logger(self.__class__.__name__)
48
+ self.lifecycle_state = LifecycleState(lifecycle=self)
49
+
50
+ def initialize(self) -> None:
51
+ if not self.lifecycle_state.can_initialize(self.lifecycle_state.get_phase()):
52
+ self.logger.warning("[{}]cannot initialize".format(self.__class__.__name__))
53
+ return
54
+ self.lifecycle_state.set_phase(LifecyclePhase.INITIALIZING)
55
+ self.do_init()
56
+ self.lifecycle_state.set_phase(LifecyclePhase.INITIALIZED)
57
+
58
+ def start(self) -> None:
59
+ if not self.lifecycle_state.can_start(self.lifecycle_state.get_phase()):
60
+ self.logger.warning("[{}]cannot start".format(self.__class__.__name__))
61
+ return
62
+ self.lifecycle_state.set_phase(LifecyclePhase.STARTING)
63
+ self.do_start()
64
+ self.lifecycle_state.set_phase(LifecyclePhase.STARTED)
65
+
66
+ def stop(self) -> None:
67
+ if not self.lifecycle_state.can_stop(self.lifecycle_state.get_phase()):
68
+ self.logger.warning("[{}]cannot stop".format(self.__class__.__name__))
69
+ return
70
+ self.lifecycle_state.set_phase(LifecyclePhase.STOPPING)
71
+ self.do_stop()
72
+ self.lifecycle_state.set_phase(LifecyclePhase.STOPPED)
73
+
74
+ def dispose(self) -> None:
75
+ if not self.lifecycle_state.can_dispose(self.lifecycle_state.get_phase()):
76
+ self.logger.warning("[{}]cannot dispose".format(self.__class__.__name__))
77
+ return
78
+ self.lifecycle_state.set_phase(LifecyclePhase.DISPOSING)
79
+ self.do_dispose()
80
+ self.lifecycle_state.set_phase(LifecyclePhase.DISPOSED)
81
+
82
+ @abstractmethod
83
+ def do_init(self) -> None:
84
+ pass
85
+
86
+ @abstractmethod
87
+ def do_start(self) -> None:
88
+ pass
89
+
90
+ @abstractmethod
91
+ def do_stop(self) -> None:
92
+ pass
93
+
94
+ @abstractmethod
95
+ def do_dispose(self) -> None:
96
+ pass
97
+
98
+
99
+ class LifecyclePhase(enum.Enum):
100
+ INITIALIZING = 1
101
+ INITIALIZED = 2
102
+ STARTING = 3
103
+ STARTED = 4
104
+ STOPPING = 5
105
+ STOPPED = 6
106
+ DISPOSING = 7
107
+ DISPOSED = 8
108
+
109
+
110
+ class LifecycleController(ABC):
111
+ def can_initialize(self, phase: Optional[LifecyclePhase]) -> bool:
112
+ return phase is None or phase == LifecyclePhase.DISPOSED
113
+
114
+ def can_start(self, phase: Optional[LifecyclePhase]) -> bool:
115
+ return phase is not None and (
116
+ phase == LifecyclePhase.INITIALIZED or phase == LifecyclePhase.STOPPED
117
+ )
118
+
119
+ def can_stop(self, phase: Optional[LifecyclePhase]) -> bool:
120
+ return phase is not None and phase == LifecyclePhase.STARTED
121
+
122
+ def can_dispose(self, phase: Optional[LifecyclePhase]) -> bool:
123
+ return phase is not None and (
124
+ phase == LifecyclePhase.INITIALIZED or phase == LifecyclePhase.STOPPED
125
+ )
126
+
127
+
128
+ LS = TypeVar("LS", bound=Lifecycle)
129
+
130
+
131
+ class LifecycleState(LifecycleController, ABC):
132
+ phase: Optional[LifecyclePhase]
133
+
134
+ def __init__(self, lifecycle: LS) -> None:
135
+ self.phase = None
136
+ self.prev_phase = None
137
+ self.lifecycle = lifecycle
138
+ self.logger = logger_factory.get_logger(__name__)
139
+
140
+ def is_initializing(self) -> bool:
141
+ return self.phase == LifecyclePhase.INITIALIZING
142
+
143
+ def is_initialized(self) -> bool:
144
+ return self.phase == LifecyclePhase.INITIALIZED
145
+
146
+ def is_starting(self) -> bool:
147
+ return self.phase == LifecyclePhase.STARTING
148
+
149
+ def is_started(self) -> bool:
150
+ return self.phase == LifecyclePhase.STARTED
151
+
152
+ def is_stopping(self) -> bool:
153
+ return self.phase == LifecyclePhase.STOPPING
154
+
155
+ def is_stopped(self) -> bool:
156
+ return self.phase == LifecyclePhase.STOPPED
157
+
158
+ def is_disposing(self) -> bool:
159
+ return self.phase == LifecyclePhase.DISPOSING
160
+
161
+ def is_disposed(self) -> bool:
162
+ return self.phase == LifecyclePhase.DISPOSED
163
+
164
+ def get_phase(self) -> Optional[LifecyclePhase]:
165
+ return self.phase
166
+
167
+ def set_phase(self, phase: Optional[LifecyclePhase]) -> None:
168
+ prev = "None"
169
+ if self.phase is not None:
170
+ prev = self.phase.name
171
+ current = "None"
172
+ if phase is not None:
173
+ current = phase.name
174
+ self.logger.info(
175
+ "[setPhaseName][{}]{} --> {}".format(
176
+ self.lifecycle.__class__.__name__,
177
+ prev,
178
+ current,
179
+ )
180
+ )
181
+ self.phase = phase
182
+
183
+ def rollback(self, err: Exception) -> None:
184
+ self.phase = self.prev_phase
185
+ self.prev_phase = None
core/logger_factory.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from logging import handlers
3
+ from typing import Optional
4
+
5
+
6
+ def get_logger(name: str, file_name: Optional[str] = None) -> logging.Logger:
7
+ logger = logging.getLogger(name)
8
+ if file_name is None:
9
+ file_name = "app-default.log"
10
+ handler = handlers.TimedRotatingFileHandler(
11
+ filename=file_name, when="d", backupCount=21, encoding="UTF-8"
12
+ )
13
+ formatter = logging.Formatter("[%(asctime)s][%(levelname)s][%(message)s]")
14
+ handler.setFormatter(formatter)
15
+ logger.addHandler(handler)
16
+ logger.setLevel(logging.INFO)
17
+ # Configure the logger as desired
18
+ # e.g., add handlers, set log levels, etc.
19
+ return logger
core/test_lifecycle.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from unittest import TestCase
3
+
4
+ from core.lifecycle import Lifecycle
5
+
6
+ logging.basicConfig()
7
+
8
+
9
+ class SubLifecycle(Lifecycle):
10
+ def __init__(self) -> None:
11
+ super().__init__()
12
+ self.init_counter = 0
13
+
14
+ def do_init(self) -> None:
15
+ self.init_counter += 1
16
+
17
+ def do_start(self) -> None:
18
+ self.init_counter += 1
19
+
20
+ def do_stop(self) -> None:
21
+ self.init_counter += 1
22
+
23
+ def do_dispose(self) -> None:
24
+ self.init_counter += 1
25
+
26
+
27
+ class TestLifecycle(TestCase):
28
+ def test_initialize(self) -> None:
29
+ ls = SubLifecycle()
30
+ ls.initialize()
31
+ ls.logger.info(ls.lifecycle_state.get_phase())
32
+ ls.start()
33
+ ls.logger.info(ls.lifecycle_state.get_phase())
34
+ ls.stop()
35
+ ls.logger.info(ls.lifecycle_state.get_phase())
36
+ ls.dispose()
37
+ ls.logger.info(ls.lifecycle_state.get_phase())
38
+
39
+ def test_start(self) -> None:
40
+ self.fail()
41
+
42
+ def test_stop(self) -> None:
43
+ self.fail()
44
+
45
+ def test_dispose(self) -> None:
46
+ self.fail()
47
+
48
+ def test_do_init(self) -> None:
49
+ self.fail()
50
+
51
+ def test_do_start(self) -> None:
52
+ self.fail()
53
+
54
+ def test_do_stop(self) -> None:
55
+ self.fail()
56
+
57
+ def test_do_dispose(self) -> None:
58
+ self.fail()
docs/docs.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a47dd9aad6afbb3c118696a8534fea9cc9b0be12746c88ae2dd2777c19423a96
3
+ size 30429
github_retriever.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from llama_hub.github_repo import GithubRepositoryReader, GithubClient
2
+ from llama_index import download_loader, GPTVectorStoreIndex
3
+ from llama_index import LLMPredictor, VectorStoreIndex, ServiceContext
4
+ from langchain.llms import AzureOpenAI
5
+ from langchain.embeddings.openai import OpenAIEmbeddings
6
+ from llama_index import LangchainEmbedding, ServiceContext
7
+ from llama_index import StorageContext, load_index_from_storage
8
+ from dotenv import load_dotenv
9
+ import os
10
+ import pickle
11
+
12
+
13
+ def main() -> None:
14
+ # define embedding
15
+ embedding = LangchainEmbedding(OpenAIEmbeddings(chunk_size=1))
16
+ # define LLM
17
+ llm_predictor = LLMPredictor(
18
+ llm=AzureOpenAI(
19
+ engine="text-davinci-003",
20
+ model_name="text-davinci-003",
21
+ )
22
+ )
23
+
24
+ # configure service context
25
+ service_context = ServiceContext.from_defaults(
26
+ llm_predictor=llm_predictor, embed_model=embedding
27
+ )
28
+ download_loader("GithubRepositoryReader")
29
+ docs = None
30
+ if os.path.exists("docs/docs.pkl"):
31
+ with open("docs/docs.pkl", "rb") as f:
32
+ docs = pickle.load(f)
33
+
34
+ if docs is None:
35
+ github_client = GithubClient(os.getenv("GITHUB_TOKEN"))
36
+ loader = GithubRepositoryReader(
37
+ github_client,
38
+ owner="ctripcorp",
39
+ repo="x-pipe",
40
+ filter_directories=(
41
+ [".", "doc"],
42
+ GithubRepositoryReader.FilterType.INCLUDE,
43
+ ),
44
+ filter_file_extensions=([".md"], GithubRepositoryReader.FilterType.INCLUDE),
45
+ verbose=True,
46
+ concurrent_requests=10,
47
+ )
48
+
49
+ docs = loader.load_data(branch="master")
50
+
51
+ with open("docs/docs.pkl", "wb") as f:
52
+ pickle.dump(docs, f)
53
+
54
+ index = GPTVectorStoreIndex.from_documents(docs, service_context=service_context)
55
+
56
+ query_engine = index.as_query_engine(service_context=service_context)
57
+ response = query_engine.query("如何使用X-Pipe?")
58
+ print(response)
59
+
60
+
61
+ if __name__ == "__main__":
62
+ load_dotenv()
63
+ main()
langchain/__init__.py ADDED
File without changes
langchain/manager.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from abc import abstractmethod, ABC
2
+
3
+ from langchain.embeddings.base import Embeddings as LCEmbeddings
4
+ from langchain.embeddings.openai import OpenAIEmbeddings
5
+ from langchain.llms import AzureOpenAI
6
+ from langchain.base_language import BaseLanguageModel
7
+
8
+ from core.lifecycle import Lifecycle
9
+
10
+
11
+ class BaseLangChainManager(Lifecycle, ABC):
12
+ def __init__(self) -> None:
13
+ super().__init__()
14
+
15
+ @abstractmethod
16
+ def get_embedding(self) -> LCEmbeddings:
17
+ pass
18
+
19
+ @abstractmethod
20
+ def get_llm(self) -> BaseLanguageModel:
21
+ pass
22
+
23
+
24
+ class LangChainAzureManager(BaseLangChainManager):
25
+ def __init__(self) -> None:
26
+ super().__init__()
27
+
28
+ # Override
29
+ def get_embedding(self) -> LCEmbeddings:
30
+ return OpenAIEmbeddings(client=None, chunk_size=1)
31
+
32
+ # Override
33
+ def get_llm(self) -> BaseLanguageModel:
34
+ return AzureOpenAI(
35
+ deployment_name="text-davinci-003",
36
+ # model_name="text-davinci-003",
37
+ model="text-davinci-003",
38
+ client=None,
39
+ )
llama/__init__.py ADDED
File without changes
llama/context.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from llama_index import ServiceContext, LLMPredictor, LangchainEmbedding
2
+ from type import Optional
3
+ from core.lifecycle import Lifecycle
4
+ from langchain.manager import BaseLangChainManager
5
+
6
+
7
+ class ServiceContextManager(Lifecycle):
8
+ service_context: Optional[ServiceContext]
9
+
10
+ def __init__(self, manager: BaseLangChainManager) -> None:
11
+ super().__init__()
12
+ self.manager = manager
13
+ self.service_context = None
14
+
15
+ def get_service_context(self) -> ServiceContext:
16
+ if self.lifecycle_state.is_started():
17
+ raise KeyError(
18
+ "incorrect lifecycle state: {}".format(self.lifecycle_state.phase)
19
+ )
20
+ if self.service_context is None:
21
+ raise ValueError(
22
+ "service context is not ready, check for lifecycle statement"
23
+ )
24
+ return self.service_context
25
+
26
+ def do_init(self) -> None:
27
+ # define embedding
28
+ embedding = LangchainEmbedding(self.manager.get_embedding())
29
+ # define LLM
30
+ llm_predictor = LLMPredictor(llm=self.manager.get_llm())
31
+ # configure service context
32
+ self.service_context = ServiceContext.from_defaults(
33
+ llm_predictor=llm_predictor, embed_model=embedding
34
+ )
35
+
36
+ def do_start(self) -> None:
37
+ pass
38
+
39
+ def do_stop(self) -> None:
40
+ pass
41
+
42
+ def do_dispose(self) -> None:
43
+ pass
44
+
45
+
46
+ class StorageContextManager(Lifecycle):
47
+ def __init__(self, dataset_path: Optional[str] = "./dataset") -> None:
48
+ super().__init__()
49
+ self.dataset_path = dataset_path
50
+
51
+ def do_init(self) -> None:
52
+ pass
53
+
54
+ def do_start(self) -> None:
55
+ pass
56
+
57
+ def do_stop(self) -> None:
58
+ pass
59
+
60
+ def do_dispose(self) -> None:
61
+ pass
llama/data_loader.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pickle
3
+ from abc import abstractmethod, ABC
4
+ from typing import Optional, Sequence, List
5
+
6
+ from llama_hub.github_repo import GithubRepositoryReader, GithubClient
7
+ from llama_index import download_loader
8
+ from llama_index.readers.schema.base import Document
9
+
10
+ from core.lifecycle import Lifecycle
11
+
12
+
13
+ class WikiLoader(ABC):
14
+ @abstractmethod
15
+ def load(self) -> List[Document]:
16
+ pass
17
+
18
+
19
+ class GithubLoader(WikiLoader, Lifecycle):
20
+ def __init__(
21
+ self,
22
+ github_owner: Optional[str] = None,
23
+ repo: Optional[str] = None,
24
+ dirs: Optional[Sequence[str]] = None,
25
+ ):
26
+ super().__init__()
27
+ self.owner = (
28
+ github_owner if github_owner is not None else os.environ["GITHUB_OWNER"]
29
+ )
30
+ self.repo = repo if repo is not None else os.environ["GITHUB_REPO"]
31
+ self.dirs = dirs if dirs is not None else [".", "doc"]
32
+
33
+ def load(self) -> List[Document]:
34
+ download_loader("GithubRepositoryReader")
35
+ docs = None
36
+ if os.path.exists("docs/docs.pkl"):
37
+ with open("docs/docs.pkl", "rb") as f:
38
+ docs = pickle.load(f)
39
+
40
+ if docs is not None:
41
+ return docs
42
+
43
+ # otherwise, we download from github and save it locally
44
+ github_client = GithubClient(os.getenv("GITHUB_TOKEN"))
45
+ loader = GithubRepositoryReader(
46
+ github_client,
47
+ # owner="ctripcorp",
48
+ owner=self.owner,
49
+ # repo="x-pipe",
50
+ repo=self.repo,
51
+ filter_directories=(self.dirs, GithubRepositoryReader.FilterType.INCLUDE),
52
+ filter_file_extensions=([".md"], GithubRepositoryReader.FilterType.INCLUDE),
53
+ verbose=True,
54
+ concurrent_requests=10,
55
+ )
56
+
57
+ docs = loader.load_data(branch="master")
58
+
59
+ with open("docs/docs.pkl", "wb") as f:
60
+ pickle.dump(docs, f)
61
+
62
+ return docs
llama/index.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from core.lifecycle import Lifecycle
2
+ from llama.context import ServiceContextManager
3
+ from llama_index.indices.vector_store import VectorStoreIndex
4
+ from typing import Optional
5
+
6
+
7
+ class IndexManager(Lifecycle):
8
+ index: Optional[VectorStoreIndex]
9
+
10
+ def __init__(self, context_manager: ServiceContextManager) -> None:
11
+ super().__init__()
12
+ self.index = None
13
+ self.context_manager = context_manager
14
+
15
+ def get_index(self) -> Optional[VectorStoreIndex]:
16
+ if not self.lifecycle_state.is_started():
17
+ raise Exception("Lifecycle state is not correct")
18
+ return self.index
llama/vector_storage.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from core.lifecycle import Lifecycle
2
+
3
+
4
+ class VectorStorageManager(Lifecycle):
5
+ def __init__(self) -> None:
6
+ super().__init__()
7
+
8
+ def do_init(self) -> None:
9
+ pass
10
+
11
+ def do_start(self) -> None:
12
+ pass
13
+
14
+ def do_stop(self) -> None:
15
+ pass
16
+
17
+ def do_dispose(self) -> None:
18
+ pass
pyproject.toml ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [tool.mypy]
2
+ ignore_missing_imports = "True"
3
+ disallow_untyped_defs = "True"
4
+ exclude = ["notebooks", "build", "examples", "docs", "dataset", "app.py", "github_retriever.py"]
5
+
6
+ [tool.ruff]
7
+ exclude = [
8
+ ".venv",
9
+ "__pycache__",
10
+ ".ipynb_checkpoints",
11
+ ".mypy_cache",
12
+ ".ruff_cache",
13
+ "examples",
14
+ "notebooks",
15
+ "docs",
16
+ "dataset",
17
+ "app.py",
18
+ "github_retriever.py"
19
+ ]
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ llama_index
2
+ llama_hub
3
+ langchain
4
+ dotenv
5
+ ruff
6
+ black
7
+ mypy