Spaces:

yinong333
/

aie4-demo-p1

Sleeping

App Files Files Community

yinong333 committed on Oct 3, 2024

Commit

7a8a241

0 Parent(s):

demo day code updates

Browse files

Files changed (8) hide show

.chainlit/config.toml +84 -0
Dockerfile +11 -0
README.md +2 -0
__pycache__/app.cpython-311.pyc +0 -0
app.py +434 -0
chainlit.md +14 -0
prototype_mvp3.ipynb +0 -0
requirements.txt +106 -0

.chainlit/config.toml ADDED Viewed

	@@ -0,0 +1,84 @@

+[project]
+# Whether to enable telemetry (default: true). No personal data is collected.
+enable_telemetry = true
+# List of environment variables to be provided by each user to use the app.
+user_env = []
+# Duration (in seconds) during which the session is saved when the connection is lost
+session_timeout = 3600
+# Enable third-party caching (e.g., LangChain cache)
+cache = false
+# Follow symlink for asset mount (see https://github.com/Chainlit/chainlit/issues/317)
+# follow_symlink = false
+[features]
+# Show the prompt playground
+prompt_playground = true
+# Process and display HTML in messages. This can be a security risk (see https://stackoverflow.com/questions/19603097/why-is-it-dangerous-to-render-user-generated-html-or-javascript)
+unsafe_allow_html = false
+# Process and display mathematical expressions. This can clash with "$" characters in messages.
+latex = false
+# Authorize users to upload files with messages
+multi_modal = true
+# Allow users to use speech-to-text
+[features.speech_to_text]
+    enabled = false
+    # See all languages here https://github.com/JamesBrill/react-speech-recognition/blob/HEAD/docs/API.md#language-string
+    # language = "en-US"
+[UI]
+# Name of the app and chatbot.
+name = "Chatbot"
+# Show the readme while the conversation is empty.
+show_readme_as_default = true
+# Description of the app and chatbot. This is used for HTML tags.
+# description = ""
+# Large content is collapsed by default for a cleaner UI
+default_collapse_content = true
+# The default value for the expand messages settings.
+default_expand_messages = false
+# Hide the chain of thought details from the user in the UI.
+hide_cot = false
+# Link to your github repo. This will add a github button in the UI's header.
+# github = ""
+# Specify a CSS file that can be used to customize the user interface.
+# The CSS file can be served from the public directory or via an external link.
+# custom_css = "/public/test.css"
+# Override default MUI light theme. (Check theme.ts)
+[UI.theme.light]
+    #background = "#FAFAFA"
+    #paper = "#FFFFFF"
+    [UI.theme.light.primary]
+        #main = "#F80061"
+        #dark = "#980039"
+        #light = "#FFE7EB"
+# Override default MUI dark theme. (Check theme.ts)
+[UI.theme.dark]
+    #background = "#FAFAFA"
+    #paper = "#FFFFFF"
+    [UI.theme.dark.primary]
+        #main = "#F80061"
+        #dark = "#980039"
+        #light = "#FFE7EB"
+[meta]
+generated_by = "0.7.700"

Dockerfile ADDED Viewed

	@@ -0,0 +1,11 @@

+FROM python:3.9
+# Run as an unprivileged user (uid 1000) instead of root.
+RUN useradd -m -u 1000 user
+USER user
+ENV HOME=/home/user \
+    PATH=/home/user/.local/bin:$PATH
+WORKDIR $HOME/app
+# Copy only the dependency manifest first so the pip layer is reused when just
+# the application code changes. COPY does not expand "~", so use $HOME.
+COPY --chown=user ./requirements.txt $HOME/app/requirements.txt
+RUN pip install -r requirements.txt
+# Copy the application code once, owned by the runtime user.
+COPY --chown=user . $HOME/app
+CMD ["chainlit", "run", "app.py", "--port", "7860"]

README.md ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ # AIE4 Final Demo Day App
2	+ Literature Review App

__pycache__/app.cpython-311.pyc ADDED Viewed

Binary file (18.8 kB). View file

app.py ADDED Viewed

	@@ -0,0 +1,434 @@

+import chainlit as cl
+from Bio import Entrez
+from langchain.tools import StructuredTool
+from langchain_openai import ChatOpenAI
+from pydantic import BaseModel
+from langgraph.graph.message import add_messages
+from langgraph.prebuilt import ToolNode
+from langgraph.graph import StateGraph, END
+from langchain_core.messages import SystemMessage, HumanMessage
+from IPython.display import display, Markdown
+from sentence_transformers import SentenceTransformer, util
+from langchain_core.messages import SystemMessage, HumanMessage
+from langchain.tools import StructuredTool
+from langchain.agents import initialize_agent, Tool, AgentType
+from langchain_openai import ChatOpenAI
+from langgraph.graph.message import add_messages
+from typing import List, TypedDict, Annotated
+import xml.etree.ElementTree as ET
+import uuid
+import re
+from langchain_qdrant import QdrantVectorStore
+from qdrant_client import QdrantClient
+from qdrant_client.http.models import Distance, VectorParams
+from qdrant_client.http.models import Filter, FieldCondition, MatchValue
+from langchain_huggingface import HuggingFaceEmbeddings
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.chains import (
+    ConversationalRetrievalChain,
+)
+from langchain.docstore.document import Document
+from langchain.memory import ChatMessageHistory, ConversationBufferMemory
+from transformers import GPT2Tokenizer
+# Load the pre-trained sentence-transformers model whose embeddings are used to
+# compare abstracts with the screening criteria (a different model name can be
+# substituted here if preferred).
+semantic_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
+def pretty_print(message: str) -> None:
+    """Display ``message`` wrapped in a fenced ``markdown`` code block."""
+    fenced = f"```markdown\n{message}\n```"
+    display(Markdown(fenced))
+# Set your Entrez email for PubMed queries
+# NOTE(review): the value below looks like a redacted placeholder — confirm that a
+# real contact address is configured before deploying.
+Entrez.email = "[email protected]"
+# 1. Define PubMed Search Tool
+# Argument schema for the PubMed search tool: the search term only.
+class PubMedSearchInput(BaseModel):
+    query: str  # search term handed to pubmed_search
+    # max_results is not part of the schema (kept commented out below).
+    #max_results: int = 5
+# PubMed search tool using Entrez (now with structured inputs)
+def pubmed_search(query: str, max_results: int = 3):
+    """Search PubMed using the Entrez API and return abstracts.
+
+    Args:
+        query: PubMed search term.
+        max_results: Maximum number of PMIDs requested from ``esearch``.
+
+    Returns:
+        A list of dicts with the keys ``PMID``, ``Title`` and ``Abstract``.
+        Records without an abstract are skipped; the list is empty when the
+        search matches nothing.
+    """
+    handle = Entrez.esearch(db="pubmed", term=query, retmax=max_results)
+    try:
+        search_record = Entrez.read(handle)
+    finally:
+        handle.close()
+    pmids = search_record["IdList"]
+    # Nothing matched: do not issue an efetch request with an empty id list.
+    if not pmids:
+        return []
+    # Fetch abstracts
+    handle = Entrez.efetch(db="pubmed", id=",".join(pmids), retmode="xml")
+    try:
+        records = Entrez.read(handle)
+    finally:
+        handle.close()
+    abstracts = []
+    for article in records['PubmedArticle']:
+        try:
+            citation = article['MedlineCitation']
+            title = str(citation['Article']['ArticleTitle'])
+            sections = citation['Article']['Abstract']['AbstractText']
+            pmid = str(citation['PMID'])
+        except KeyError:
+            # Record has no abstract (or is incomplete): nothing to screen.
+            continue
+        # Structured abstracts are split into several sections; keep all of
+        # them instead of only the first one.
+        abstract = " ".join(str(section) for section in sections).strip()
+        if not abstract:
+            continue
+        abstracts.append({"PMID": pmid, "Title": title, "Abstract": abstract})
+    return abstracts
+# Define the AbstractScreeningInput using Pydantic BaseModel
+# Argument schema for the semantic abstract screening tool.
+class AbstractScreeningInput(BaseModel):
+    abstracts: List[dict]  # one dict per paper; screen_abstracts_semantic reads 'PMID' and 'Abstract'
+    criteria: str  # screening criteria text the abstracts are compared against
+def screen_abstracts_semantic(abstracts: List[dict], criteria: str, similarity_threshold: float = 0.4):
+    """Label every abstract as Include/Exclude by its similarity to the criteria.
+
+    Each abstract and the criteria are embedded with ``semantic_model``; an
+    abstract is included when the cosine similarity of the two embeddings is
+    at least ``similarity_threshold``.
+
+    Returns:
+        One dict per input paper with the keys ``PMID``, ``Decision`` and
+        ``Reason`` (the score and the threshold it was compared with).
+    """
+    # The criteria embedding is shared by all comparisons.
+    target_embedding = semantic_model.encode(criteria, convert_to_tensor=True)
+    decisions = []
+    for entry in abstracts:
+        candidate_embedding = semantic_model.encode(entry['Abstract'], convert_to_tensor=True)
+        similarity_score = util.cos_sim(candidate_embedding, target_embedding).item()
+        passes = similarity_score >= similarity_threshold
+        if passes:
+            verdict = "Include"
+            reason = f"Similarity score {similarity_score:.2f} >= threshold {similarity_threshold}"
+        else:
+            verdict = "Exclude"
+            reason = f"Similarity score {similarity_score:.2f} < threshold {similarity_threshold}"
+        decisions.append({"PMID": entry['PMID'], "Decision": verdict, "Reason": reason})
+    return decisions
+# Define the PubMed Search Tool as a StructuredTool with proper input schema.
+# It wraps pubmed_search and takes its arguments from PubMedSearchInput.
+pubmed_tool = StructuredTool(
+    name="PubMed_Search_Tool",
+    func=pubmed_search,
+    description="Search PubMed for research papers and retrieve abstracts. Pass the abstracts (returned results) to another tool.",
+    args_schema=PubMedSearchInput  # Use Pydantic BaseModel for schema
+)
+# Define the Abstract Screening Tool with semantic screening.
+# It wraps screen_abstracts_semantic; the description below also tells the
+# model to forward the PMIDs that pass screening to Fetch_Extract_Tool.
+semantic_screening_tool = StructuredTool(
+    name="Semantic_Abstract_Screening_Tool",
+    func=screen_abstracts_semantic,
+    description="""Screen PubMed abstracts based on semantic similarity to inclusion/exclusion criteria. Uses cosine similarity between abstracts and criteria. Requires 'abstracts' and 'screening criteria' as input.
+    The 'abstracts' is a list of dictionary with keys as PMID, Title, Abstract.
+    Output a similarity scores for each abstract and send the list of pmids that passed the screening to Fetch_Extract_Tool.""",
+    args_schema=AbstractScreeningInput  # Pydantic schema remains the same
+)
+# 3. Define Full-Text Retrieval Tool
+# Argument schema for the full-text fetch-and-extract tool.
+class FetchExtractInput(BaseModel):
+    pmids: List[str]  # List of PubMed IDs to fetch full text for
+    query: str  # question to answer from each paper's text
+def extract_text_from_pmc_xml(xml_content: str) -> str:
+    """Extract readable text from PMC full-text XML.
+
+    Paragraph (``p``) and ``title`` elements contribute their complete text,
+    including text inside and after inline markup such as ``<italic>``.
+    Container elements (``sec``, ``abstract``, ``body``) contribute only their
+    own leading text, because their children are visited separately.
+    Whitespace-only pieces are dropped.
+
+    Args:
+        xml_content: The XML document to parse.
+
+    Returns:
+        The extracted pieces joined by blank lines, or "" when the XML cannot
+        be parsed.
+    """
+    text_tags = {'p', 'title'}
+    container_tags = {'sec', 'abstract', 'body'}  # Add more tags as needed
+    try:
+        root = ET.fromstring(xml_content)
+    except ET.ParseError:
+        print("Error parsing XML content.")
+        return ""
+    body_text = []
+    # ids of elements whose text was already captured through an ancestor, so
+    # nested paragraphs/titles are not emitted twice.
+    consumed = set()
+    for elem in root.iter():
+        if id(elem) in consumed:
+            continue
+        if elem.tag in text_tags:
+            # itertext() also yields text that follows inline child elements,
+            # which elem.text alone would drop.
+            text = "".join(elem.itertext())
+            consumed.update(id(descendant) for descendant in elem.iter())
+        elif elem.tag in container_tags:
+            text = elem.text or ""
+        else:
+            continue
+        text = text.strip()
+        if text:
+            body_text.append(text)
+    # Join all the text elements to form the complete full text
+    return "\n\n".join(body_text)
+def fetch_and_extract(pmids: List[str], query: str):
+    """Fetch full text for the given PMIDs and answer ``query`` per paper via RAG.
+
+    The PubMed records of all PMIDs are fetched; when a record lists a PMC id,
+    the full text is downloaded from PubMed Central, otherwise the placeholder
+    text "cannot fetch" is stored. The texts are split into chunks and embedded
+    into an in-memory Qdrant collection. For every requested PMID up to 3
+    chunks of that paper are retrieved and used to generate an answer.
+
+    Args:
+        pmids: PubMed IDs of the papers to process.
+        query: Question to answer from each paper's text.
+
+    Returns:
+        A dict keyed by PMID; each value holds "PMID", "Title",
+        "Generated Answer" and "Sources" (the retrieved source documents).
+        Empty when ``pmids`` is empty.
+    """
+    # Nothing to look up: do not issue an efetch request with an empty id list.
+    if not pmids:
+        return {}
+    embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+    consolidated_results = {}
+    # Fetch the PubMed records; they carry the PMC id (if any) of each paper
+    handle = Entrez.efetch(db="pubmed", id=",".join(pmids), retmode="xml")
+    try:
+        records = Entrez.read(handle)
+    finally:
+        handle.close()
+    full_articles = []
+    for record in records['PubmedArticle']:
+        try:
+            # Plain str values, so the stored PMID has the same type as the
+            # str used in the Qdrant filter further down.
+            title = str(record['MedlineCitation']['Article']['ArticleTitle'])
+            pmid = str(record['MedlineCitation']['PMID'])
+            article_ids = record['PubmedData']['ArticleIdList']
+        except KeyError:
+            # Incomplete record: skip it
+            continue
+        # Extract PMC ID if available
+        pmc_id = None
+        for ele in article_ids:
+            if ele.attributes.get('IdType') == 'pmc':
+                pmc_id = ele.replace('PMC', '')
+                break
+        if pmc_id is None:
+            full_articles.append({"PMID": pmid, "Title": title, "FullText": "cannot fetch"})
+            continue
+        # Fetch full article from PMC
+        handle = Entrez.efetch(db="pmc", id=pmc_id, rettype="full", retmode="xml")
+        try:
+            full_article = handle.read()
+        finally:
+            handle.close()
+        full_articles.append({
+            "PMID": pmid,
+            "Title": title,
+            "FullText": extract_text_from_pmc_xml(full_article),  # cleaned text, chunked below
+        })
+    # One document per article; chunking happens in the splitter below
+    documents = [
+        Document(
+            page_content=article["FullText"],
+            metadata={"PMID": article["PMID"], "Title": article["Title"]},
+        )
+        for article in full_articles
+    ]
+    CHUNK_SIZE = 1000
+    CHUNK_OVERLAP = 200
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=CHUNK_SIZE,
+        chunk_overlap=CHUNK_OVERLAP,
+        length_function=len,
+    )
+    split_chunks = text_splitter.split_documents(documents)
+    # Tag every chunk with a unique id, always stored as a str
+    seen_ids = set()
+    for document in split_chunks:
+        chunk_id = str(uuid.uuid4())
+        while chunk_id in seen_ids:
+            chunk_id = str(uuid.uuid4())
+        seen_ids.add(chunk_id)
+        document.metadata["uuid"] = chunk_id
+    LOCATION = ":memory:"
+    COLLECTION_NAME = "pmd_data"
+    VECTOR_SIZE = 384
+    # Initialize Qdrant client
+    qdrant_client = QdrantClient(location=LOCATION)
+    # Create a collection in Qdrant
+    qdrant_client.create_collection(
+        collection_name=COLLECTION_NAME,
+        vectors_config=VectorParams(size=VECTOR_SIZE, distance=Distance.COSINE),
+    )
+    # Initialize the Qdrant vector store on top of that collection
+    vdb = QdrantVectorStore(
+        client=qdrant_client,
+        collection_name=COLLECTION_NAME,
+        embedding=embedding_model,
+    )
+    # Add embedded documents to Qdrant
+    if split_chunks:
+        vdb.add_documents(split_chunks)
+    # Query for each paper and consolidate answers
+    for pmid in pmids:
+        # Restrict retrieval to chunks whose metadata PMID is the current one
+        qdrant_filter = Filter(
+            must=[
+                FieldCondition(key="metadata.PMID", match=MatchValue(value=pmid))
+            ]
+        )
+        retriever_with_filter = vdb.as_retriever(
+            search_kwargs={
+                "filter": qdrant_filter,
+                "k": 3  # Retrieve 3 chunks per PMID
+            }
+        )
+        # Reset message history and memory for each query to avoid interference
+        message_history = ChatMessageHistory()
+        memory = ConversationBufferMemory(memory_key="chat_history", output_key="answer", chat_memory=message_history, return_messages=True)
+        # Create the ConversationalRetrievalChain with the filtered retriever
+        qa_chain = ConversationalRetrievalChain.from_llm(
+            ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0),
+            retriever=retriever_with_filter,
+            memory=memory,
+            return_source_documents=True
+        )
+        # Query the vector store for relevant documents and extract information
+        result = qa_chain({"question": query})
+        generated_source = result["source_documents"]
+        # Consolidate the results for each paper
+        consolidated_results[pmid] = {
+            "PMID": pmid,
+            "Title": generated_source[0].metadata["Title"] if generated_source else "Unknown Title",
+            "Generated Answer": result["answer"],  # LLM answer based on the retrieved chunks
+            "Sources": generated_source
+        }
+    # Return consolidated results for all papers
+    return consolidated_results
+# Tool wrapping fetch_and_extract; its arguments follow FetchExtractInput.
+rag_tool = StructuredTool(
+    name="Fetch_Extract_Tool",
+    func=fetch_and_extract,
+    description="""Fetch full-text articles based on PMIDs and store them in a Qdrant vector database.
+    Then extract information based on user's query via Qdrant retriever using a RAG pipeline.
+    Requires list of PMIDs and user query as input.""",
+    args_schema=FetchExtractInput
+)
+# All tools made available to the agent (bound to the model below and used by the ToolNode)
+tool_belt = [
+    pubmed_tool,
+    semantic_screening_tool,
+    rag_tool
+]
+# Model setup with tools bound
+model = ChatOpenAI(model="gpt-4o", temperature=0)
+model = model.bind_tools(tool_belt)
+# Agent state to handle the messages
+# NOTE(review): declared as a plain dict subclass although TypedDict is imported
+# at the top of the file — confirm this is intended.
+class AgentState(dict):
+    # Conversation messages, annotated with the add_messages reducer
+    messages: Annotated[list, add_messages]
+    cycle_count: int  # Add a counter to track the number of cycles
+# Agent node: query the tool-bound model and advance the cycle counter
+def call_model(state):
+    """Invoke the model on the current messages.
+
+    Returns a state update holding the model's reply as the only new message
+    and ``cycle_count`` incremented by one.
+    """
+    reply = model.invoke(state["messages"])
+    next_cycle = state["cycle_count"] + 1
+    return {"messages": [reply], "cycle_count": next_cycle}
+# Node that executes the tools from tool_belt
+tool_node = ToolNode(tool_belt)
+# Create the state graph for managing the flow between the agent and tools
+uncompiled_graph = StateGraph(AgentState)
+# "agent" calls the model; "action" runs the tools
+uncompiled_graph.add_node("agent", call_model)
+uncompiled_graph.add_node("action", tool_node)
+# Set the entry point for the graph
+uncompiled_graph.set_entry_point("agent")
+# Define a function to check if the process should continue
+def should_continue(state):
+    """Decide where to go after the agent node.
+
+    Returns ``END`` when the cycle count exceeds the limit or when the last
+    message requests no tool calls; otherwise returns "action" so the
+    requested tools are run.
+    """
+    max_cycles = 20
+    # Stop once the cycle count exceeds the limit
+    if state["cycle_count"] > max_cycles:
+        print(f"Exceeded the cycle limit of {max_cycles} cycles (cycle count: {state['cycle_count']}). Ending the process.")
+        return END
+    # If there are tool calls, continue to the action node
+    last_message = state["messages"][-1]
+    if getattr(last_message, "tool_calls", None):
+        return "action"
+    return END
+# Add conditional edges for the agent to action: should_continue decides
+# whether the agent's output goes to the tools or ends the run
+uncompiled_graph.add_conditional_edges("agent", should_continue)
+# After the tools have run, control goes back to the agent
+uncompiled_graph.add_edge("action", "agent")
+# Compile the state graph
+compiled_graph = uncompiled_graph.compile()
+# Function to run the compiled graph asynchronously
+async def run_graph(inputs):
+    """Stream the compiled graph and return the content of the last message seen.
+
+    Every node update that carries messages is printed; the ``content`` of the
+    most recent message is remembered and returned once the stream ends.
+
+    Args:
+        inputs: Initial graph state (messages and cycle count).
+
+    Returns:
+        The content of the last message produced, or ``None`` when no update
+        carried a message with content.
+    """
+    final_message_content = None  # Content of the most recent message
+    async for chunk in compiled_graph.astream(inputs, stream_mode="updates"):
+        for values in chunk.values():
+            # Check for messages before touching the key, so an update
+            # without messages is skipped instead of raising.
+            messages = values["messages"] if values and "messages" in values else None
+            if not messages:
+                continue
+            print(messages)
+            final_message = messages[-1]
+            if hasattr(final_message, 'content'):
+                final_message_content = final_message.content
+        print("\n\n")
+    if final_message_content:
+        print("Final message content from the last chunk:")
+        print(final_message_content)
+    return final_message_content
+# Chainlit interaction setup
+@cl.on_chat_start
+async def on_chat_start():
+    """Send the welcome message when a chat session starts."""
+    greeting = cl.Message(content="Welcome! Please provide your PubMed query and screening criteria.")
+    await greeting.send()
+@cl.on_message
+async def main(message):
+    """Run the literature-review agent on the user's message and send the reply.
+
+    The user's text is paired with fixed system instructions, the graph is run
+    with the cycle counter at 0, and the final message content (or an apology
+    when there is none) is sent back to the Chainlit UI.
+    """
+    # Earlier 4-step variant of the system prompt, kept for reference:
+    # system_instructions = SystemMessage(content="""
+    # 1. Use the PubMed search tool to search for papers.
+    # 2. Retrieve the abstracts from the search results.
+    # 3. Screen the abstracts based on the criteria provided by the user. If error happens,retry by feeding in both 'abstracts' and 'screening criteria' as input.
+    #     The 'abstracts' is a list of dictionary with keys as PMID, Title, Abstract (which is extracted from previous step). For the decisions of include and exclude, give me the similarity score you calculated.
+    # 4. Please provide a full summary at the end of the entire flow executed, detailing the whole process/reasoning for each paper.
+    # The user will provide the search query and screening criteria.
+    # Make sure you finish everything in one step before moving on to next step.
+    # Do not call more than one tool in one action.""")
+    system_instructions = SystemMessage(content="""Please execute the following steps in sequence:
+    1. Use the PubMed search tool to search for papers.
+    2. Retrieve the abstracts from the search results.
+    3. Screen the abstracts based on the criteria provided by the user.
+    4. Fetch full-text articles for all the papers that pass step 3. Store the full-text articles in the Qdrant vector database,
+        and extract the requested information for each article that passed step 3 from the full-text using the query provided by the user.
+    5. Please provide a full summary at the end of the entire flow executed, detailing each paper's title, PMID, and the whole process/screening/reasoning for each paper.
+    The user will provide the search query, screening criteria, and the query for information extraction.
+    Make sure you finish everything in one step before moving on to next step.
+    Do not call more than one tool in one action.""")
+    # The user's message carries the query, the screening criteria and the extraction question
+    human_inputs = HumanMessage(content=message.content)
+    initial_state = {
+        "messages": [system_instructions, human_inputs],
+        "cycle_count": 0,
+    }
+    # Run the agent flow, then show its answer (or a fallback) in the Chainlit UI
+    response = await run_graph(initial_state)
+    reply_text = response if response else "Sorry, I couldn't process the request."
+    await cl.Message(content=reply_text).send()

chainlit.md ADDED Viewed

	@@ -0,0 +1,14 @@

+# Welcome to Chainlit! 🚀🤖
+Hi there, Developer! 👋 We're excited to have you on board. Chainlit is a powerful tool designed to help you prototype, debug and share applications built on top of LLMs.
+## Useful Links 🔗
+- **Documentation:** Get started with our comprehensive [Chainlit Documentation](https://docs.chainlit.io) 📚
+- **Discord Community:** Join our friendly [Chainlit Discord](https://discord.gg/k73SQ3FyUh) to ask questions, share your projects, and connect with other developers! 💬
+We can't wait to see what you create with Chainlit! Happy coding! 💻😊
+## Welcome screen
+To modify the welcome screen, edit the `chainlit.md` file at the root of your project. If you do not want a welcome screen, just leave this file empty.

prototype_mvp3.ipynb ADDED Viewed

The diff for this file is too large to render. See raw diff

requirements.txt ADDED Viewed

	@@ -0,0 +1,106 @@

+aiofiles==23.2.1
+aiohappyeyeballs==2.4.3
+aiohttp==3.10.8
+aiosignal==1.3.1
+annotated-types==0.7.0
+anyio==3.7.1
+async-timeout==4.0.3
+asyncer==0.0.2
+attrs==24.2.0
+bidict==0.23.1
+certifi==2024.8.30
+chainlit==0.7.700
+charset-normalizer==3.3.2
+click==8.1.7
+dataclasses-json==0.5.14
+Deprecated==1.2.14
+distro==1.9.0
+exceptiongroup==1.2.2
+fastapi==0.100.1
+fastapi-socketio==0.0.10
+filetype==1.2.0
+frozenlist==1.4.1
+googleapis-common-protos==1.65.0
+greenlet==3.1.1
+grpcio==1.66.2
+grpcio-tools==1.62.3
+h11==0.14.0
+h2==4.1.0
+hpack==4.0.0
+httpcore==0.17.3
+httpx==0.24.1
+hyperframe==6.0.1
+idna==3.10
+importlib_metadata==8.4.0
+jiter==0.5.0
+jsonpatch==1.33
+jsonpointer==3.0.0
+langchain==0.2.16
+langchain-community==0.2.16
+langchain-core==0.2.38
+langchain-openai==0.1.23
+langchain-qdrant==0.1.4
+langgraph==0.2.19
+langchain-huggingface==0.0.3
+langchain-text-splitters==0.2.4
+langsmith==0.1.121
+Lazify==0.4.0
+marshmallow==3.22.0
+multidict==6.1.0
+mypy-extensions==1.0.0
+nest-asyncio==1.6.0
+numpy==1.26.4
+openai==1.44.0
+opentelemetry-api==1.27.0
+opentelemetry-exporter-otlp==1.27.0
+opentelemetry-exporter-otlp-proto-common==1.27.0
+opentelemetry-exporter-otlp-proto-grpc==1.27.0
+opentelemetry-exporter-otlp-proto-http==1.27.0
+opentelemetry-instrumentation==0.48b0
+opentelemetry-proto==1.27.0
+opentelemetry-sdk==1.27.0
+opentelemetry-semantic-conventions==0.48b0
+orjson==3.10.7
+packaging==23.2
+portalocker==2.10.1
+protobuf==4.25.5
+pydantic==2.9.0
+pydantic-settings==2.5.2
+pydantic_core==2.23.2
+PyJWT==2.9.0
+PyMuPDF==1.24.10
+PyMuPDFb==1.24.10
+python-dotenv==1.0.1
+python-engineio==4.9.1
+python-graphql-client==0.4.3
+python-multipart==0.0.6
+python-socketio==5.11.4
+PyYAML==6.0.2
+qdrant-client==1.11.3
+regex==2024.9.11
+requests==2.32.3
+simple-websocket==1.0.0
+sniffio==1.3.1
+SQLAlchemy==2.0.35
+starlette==0.27.0
+syncer==2.0.3
+tenacity==8.5.0
+tiktoken==0.7.0
+tomli==2.0.1
+tqdm==4.66.5
+typing-inspect==0.9.0
+typing_extensions==4.12.2
+uptrace==1.26.0
+urllib3==2.2.3
+uvicorn==0.23.2
+watchfiles==0.20.0
+websockets==13.1
+wrapt==1.16.0
+wsproto==1.2.0
+yarl==1.13.1
+zipp==3.20.2
+Bio==1.84
+unstructured==0.15.7
+python-pptx==1.0.2
+nltk==3.9.1
+sentence-transformers==3.1.1