Spaces:

Tristan107
/

scrum-expert

Running

App Files Files Community

trobet committed 8 days ago

Commit

daa40a8

1 Parent(s): 97be51f

Scrum paraphrase-MiniLM DeepSeek-R1-Distill-Llama-70B

Browse files

Files changed (7) hide show

.gitattributes +2 -0
.gitignore +5 -0
README.md +24 -5
app.py +92 -0
data/2020-Scrum-Guide-English.pdf +3 -0
data/2020-Scrum-Guide-French.pdf +3 -0
requirements.txt +102 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.pdf filter=lfs diff=lfs merge=lfs -text
+*.docx filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,5 @@

+/.streamlit
+/.venv
+/storage
+/.idea

README.md CHANGED Viewed

@@ -1,13 +1,32 @@
 ---
-title: Scrum Expert
-emoji: 📉
-colorFrom: yellow
 colorTo: blue
 sdk: streamlit
 sdk_version: 1.42.0
 app_file: app.py
 pinned: false
-short_description: Scrum expert focusing on the fr/en Scrum Guides
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Test
+emoji: 📚
+colorFrom: purple
 colorTo: blue
 sdk: streamlit
 sdk_version: 1.42.0
 app_file: app.py
 pinned: false
+short_description: First Space
 ---
+# Introduction
+This is a RAG (retrieval-augmented generation) showcase that can easily be adapted to any set of documents (mainly PDF, DOCX, TXT and CSV files).
+# How to run it locally?
+* Clone the git repository
+* Replace the documents in ./data with your own documents
+* Customize the constants at the beginning of app.py
+* Create a .streamlit directory
+* Create a .streamlit/secrets.toml file containing:
+`openai_key="your-akash-api-key"` (get your free key here: https://chatapi.akash.network/ > Get Started)
+* With the .venv virtual environment activated, run `pip install -r requirements.txt`
+* Then `python -m streamlit run app.py`
+***Note***: Every time you change the embedding model, you must delete the "storage" directory so that the local vector database is rebuilt.
+# How to run it on a new Hugging Face Space?
+Once it runs locally, just commit and push to a new Hugging Face Space. You must also add your Akash API key as a Secret named `openai_key` in the "Settings > Variables and secrets" section of your Space.

app.py ADDED Viewed

	@@ -0,0 +1,92 @@

+import logging
+import os
+import time
+import streamlit as st
+import torch
+import sys
+from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings, StorageContext, load_index_from_storage
+from llama_index.core.chat_engine.types import ChatMode
+from llama_index.embeddings.huggingface import HuggingFaceEmbedding
+from llama_index.llms.openai_like import OpenAILike
+PAGE_TITLE="Votre expert SCRUM"
+CHAT_TITLE="Posez-moi une question sur le guide Scrum 2020 (anglais ou français)"
+SYSTEM_PROMPT="Use the context information provided to assist the user. Mention the origins of the informations at the bottom of the response (file and page)."
+EMBEDDING_MODEL="sentence-transformers/paraphrase-MiniLM-L6-v2" # Fast embedding model
+#EMBEDDING_MODEL="BAAI/bge-m3" # Multilingual large model
+LLM_MODEL="DeepSeek-R1-Distill-Llama-70B" # Available models on : https://chatapi.akash.network/documentation#models
+NB_DOC_CHUNKS_TO_SEND=5
+MAX_NB_TOKENS_IN_RESPONSE=1500
+TEMPERATURE=0.2 # The closer to 1, the less deterministic and the more creative
+API_BASE_URL="https://chatapi.akash.network/api/v1" # Changing this requires to adapt the custom_llm initialization
+# Ajuster le chemin de torch.classes pour éviter le conflit
+torch.classes.__path__ = []
+st.set_page_config(page_title=PAGE_TITLE, layout="centered", initial_sidebar_state="auto", menu_items=None)
+st.title(PAGE_TITLE)
+custom_llm = OpenAILike(model=LLM_MODEL, api_base=API_BASE_URL, api_key=st.secrets["openai_key"], max_tokens=MAX_NB_TOKENS_IN_RESPONSE, temperature=TEMPERATURE)
+Settings.embed_model = HuggingFaceEmbedding(model_name=EMBEDDING_MODEL)
+Settings.llm=custom_llm
+logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
+logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))
+# Load and index data
+@st.cache_resource
+def load_data():
+    persist_dir = "./storage"
+    if not os.path.exists(persist_dir):
+        documents = SimpleDirectoryReader(input_dir="./data").load_data()
+        document_index = VectorStoreIndex.from_documents(documents)
+        document_index.storage_context.persist(persist_dir=persist_dir)
+    else:
+        storage_context = StorageContext.from_defaults(persist_dir=persist_dir)
+        document_index = load_index_from_storage(storage_context)
+    return document_index
+start_time = time.time()
+index = load_data()
+end_time = time.time()
+print(f"Time taken for loading embeddings: {end_time - start_time:.4f} seconds")
+start_time = time.time()
+if "messages" not in st.session_state.keys():  # Initialize the chat messages history
+    st.session_state.messages = [
+        {
+            "role": "assistant",
+            "content": CHAT_TITLE,
+        }
+    ]
+if "chat_engine" not in st.session_state.keys():  # Initialize the chat engine
+    st.session_state.chat_engine = index.as_chat_engine(chat_mode=ChatMode.CONTEXT, system_prompt=SYSTEM_PROMPT, similarity_top_k=NB_DOC_CHUNKS_TO_SEND, verbose=True, streaming=True)
+if prompt := st.chat_input("Posez votre question"):  # Prompt for user input and save to chat history
+    st.session_state.messages.append({"role": "user", "content": prompt})
+for message in st.session_state.messages:  # Write message history to UI
+    with st.chat_message(message["role"]):
+        st.write(message["content"])
+# If last message is not from assistant, generate a new response
+if st.session_state.messages[-1]["role"] != "assistant":
+    with st.chat_message("assistant"):
+        start_time = time.time()
+        response_stream = st.session_state.chat_engine.stream_chat(prompt)
+        st.write_stream(response_stream.response_gen)
+        message = {"role": "assistant", "content": response_stream.response}
+        # Add response to message history
+        st.session_state.messages.append(message)
+        end_time = time.time()
+        print(f"Time taken for getting response: {end_time - start_time:.4f} seconds")
+        start_time = time.time()

data/2020-Scrum-Guide-English.pdf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ed83eb2378459c9e5da5e695844a24c3770fba33687cafaf0a0683ad5070b3ec
+size 254353

data/2020-Scrum-Guide-French.pdf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0dd5b8f5af1f90ac81caac4194cf3cb73daef99eadfdf3799c669ad796cd3ba3
+size 306931

requirements.txt ADDED Viewed

	@@ -0,0 +1,102 @@

+aiohappyeyeballs==2.4.6
+aiohttp==3.11.12
+aiosignal==1.3.2
+altair==5.5.0
+annotated-types==0.7.0
+anyio==4.8.0
+attrs==25.1.0
+beautifulsoup4==4.13.3
+blinker==1.9.0
+cachetools==5.5.1
+certifi==2025.1.31
+charset-normalizer==3.4.1
+click==8.1.8
+colorama==0.4.6
+dataclasses-json==0.6.7
+Deprecated==1.2.18
+dirtyjson==1.0.8
+distro==1.9.0
+embeddings==0.0.8
+filelock==3.17.0
+filetype==1.2.0
+frozenlist==1.5.0
+fsspec==2025.2.0
+gitdb==4.0.12
+GitPython==3.1.44
+greenlet==3.1.1
+h11==0.14.0
+httpcore==1.0.7
+httpx==0.28.1
+huggingface-hub==0.28.1
+idna==3.10
+Jinja2==3.1.5
+jiter==0.8.2
+joblib==1.4.2
+jsonschema==4.23.0
+jsonschema-specifications==2024.10.1
+llama-index-core==0.12.16.post1
+llama-index-embeddings-huggingface==0.5.1
+llama-index-llms-openai==0.3.18
+llama-index-llms-openai-like==0.3.3
+llama-index-readers-file==0.4.4
+markdown-it-py==3.0.0
+MarkupSafe==3.0.2
+marshmallow==3.26.1
+mdurl==0.1.2
+mpmath==1.3.0
+multidict==6.1.0
+mypy-extensions==1.0.0
+narwhals==1.25.2
+nest-asyncio==1.6.0
+networkx==3.4.2
+nltk==3.9.1
+numpy==2.2.2
+openai==1.61.1
+packaging==24.2
+pandas==2.2.3
+pillow==11.1.0
+propcache==0.2.1
+protobuf==5.29.3
+pyarrow==19.0.0
+pydantic==2.10.6
+pydantic_core==2.27.2
+pydeck==0.9.1
+Pygments==2.19.1
+pypdf==5.3.0
+python-dateutil==2.9.0.post0
+pytz==2025.1
+PyYAML==6.0.2
+referencing==0.36.2
+regex==2024.11.6
+requests==2.32.3
+rich==13.9.4
+rpds-py==0.22.3
+safetensors==0.5.2
+scikit-learn==1.6.1
+scipy==1.15.1
+sentence-transformers==3.4.1
+setuptools==75.8.0
+six==1.17.0
+smmap==5.0.2
+sniffio==1.3.1
+soupsieve==2.6
+SQLAlchemy==2.0.38
+streamlit==1.42.0
+striprtf==0.0.26
+sympy==1.13.1
+tenacity==9.0.0
+threadpoolctl==3.5.0
+tiktoken==0.8.0
+tokenizers==0.21.0
+toml==0.10.2
+torch==2.6.0
+tornado==6.4.2
+tqdm==4.67.1
+transformers==4.48.3
+typing-inspect==0.9.0
+typing_extensions==4.12.2
+tzdata==2025.1
+urllib3==2.3.0
+watchdog==6.0.0
+wrapt==1.17.2
+yarl==1.18.3