Upload folder using huggingface_hub
Browse files- .gitattributes +2 -0
- README.md +7 -1
- triplets2/assertion.log +0 -0
- triplets2/azure_openai_usage.log +113 -0
- triplets2/books/SILENT WEAPONS for QUIET WARS.pdf +3 -0
- triplets2/dev/kg_syngen.py +135 -0
- triplets2/dev/optimized_KG2Text +0 -0
- triplets2/dev/text2prop_syngen.py +205 -0
- triplets2/document_data_chunked_input.json +3 -0
- triplets2/k25-infer.py +353 -0
- triplets2/k25.py +376 -0
- triplets2/openai_usage.log +0 -0
- triplets2/optimized_k24-working-42pc.json +0 -0
- triplets2/triples-kg2summary.py +132 -0
- triplets2/triples-kg_syngen.jsonl +0 -0
- triplets2/triples-optimized_text2kg +0 -0
- triplets2/triples-train.py +55 -0
.gitattributes
CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
triplets2/books/SILENT[[:space:]]WEAPONS[[:space:]]for[[:space:]]QUIET[[:space:]]WARS.pdf filter=lfs diff=lfs merge=lfs -text
|
37 |
+
triplets2/document_data_chunked_input.json filter=lfs diff=lfs merge=lfs -text
|
README.md
CHANGED
@@ -63,7 +63,8 @@ Output:
|
|
63 |
10. Train model using unsloth/Qwen2.5-3B with specific parameters
|
64 |
|
65 |
|
66 |
-
|
|
|
67 |
```xml
|
68 |
<title>Fission Products</title>
|
69 |
<content>
|
@@ -132,6 +133,11 @@ Sample:
|
|
132 |
- **Neutron Emission:** These decaying fission products can undergo various decay modes that release neutrons, such as alpha decay, beta decay, and γ-decay.
|
133 |
- **Replenishment of Ne
|
134 |
</content>
|
|
|
|
|
|
|
|
|
|
|
135 |
<type>explanation</type>
|
136 |
<thinking> To compare the effects of Thouless pumping and the decay of fission products on neutron stream density and flux, we need to analyze their individual contributions and how they interact. </thinking>
|
137 |
<task> Compare and contrast the effects of the Thouless pumping mechanism and the decay of fission products on the density and flux of neutrons within a nuclear reactor. Discuss how these two phenomena work together to maintain a sustained neutron stream. </task>
|
|
|
63 |
10. Train model using unsloth/Qwen2.5-3B with specific parameters
|
64 |
|
65 |
|
66 |
+
### Input
|
67 |
+
|
68 |
```xml
|
69 |
<title>Fission Products</title>
|
70 |
<content>
|
|
|
133 |
- **Neutron Emission:** These decaying fission products can undergo various decay modes that release neutrons, such as alpha decay, beta decay, and γ-decay.
|
134 |
- **Replenishment of Ne
|
135 |
</content>
|
136 |
+
```
|
137 |
+
|
138 |
+
### Output
|
139 |
+
|
140 |
+
```xml
|
141 |
<type>explanation</type>
|
142 |
<thinking> To compare the effects of Thouless pumping and the decay of fission products on neutron stream density and flux, we need to analyze their individual contributions and how they interact. </thinking>
|
143 |
<task> Compare and contrast the effects of the Thouless pumping mechanism and the decay of fission products on the density and flux of neutrons within a nuclear reactor. Discuss how these two phenomena work together to maintain a sustained neutron stream. </task>
|
triplets2/assertion.log
ADDED
File without changes
|
triplets2/azure_openai_usage.log
ADDED
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Retrying request to /chat/completions in 0.958111 seconds
|
2 |
+
Retrying request to /chat/completions in 1.745523 seconds
|
3 |
+
Retrying request to /chat/completions in 0.820110 seconds
|
4 |
+
Retrying request to /chat/completions in 1.505952 seconds
|
5 |
+
HTTP Request: POST https://api.freegpt.today/v1chat/completions "HTTP/1.1 404 Not Found"
|
6 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
7 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
8 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
9 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
10 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
11 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
12 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
13 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
14 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 500 Internal Server Error"
|
15 |
+
Retrying request to /completions in 0.947293 seconds
|
16 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 500 Internal Server Error"
|
17 |
+
Retrying request to /completions in 1.887936 seconds
|
18 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 500 Internal Server Error"
|
19 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
20 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
21 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
22 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
23 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
24 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
25 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
26 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 502 Bad Gateway"
|
27 |
+
Retrying request to /completions in 0.814191 seconds
|
28 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 502 Bad Gateway"
|
29 |
+
Retrying request to /completions in 1.747078 seconds
|
30 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 502 Bad Gateway"
|
31 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 502 Bad Gateway"
|
32 |
+
Retrying request to /completions in 0.909832 seconds
|
33 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 502 Bad Gateway"
|
34 |
+
Retrying request to /completions in 1.592161 seconds
|
35 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 502 Bad Gateway"
|
36 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
37 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
38 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
39 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 500 Internal Server Error"
|
40 |
+
Retrying request to /completions in 0.837366 seconds
|
41 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 502 Bad Gateway"
|
42 |
+
Retrying request to /completions in 1.775891 seconds
|
43 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 502 Bad Gateway"
|
44 |
+
Retrying request to /completions in 0.964984 seconds
|
45 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 502 Bad Gateway"
|
46 |
+
Retrying request to /completions in 1.653462 seconds
|
47 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 502 Bad Gateway"
|
48 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
49 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
50 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
51 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
52 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
53 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
54 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
55 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
56 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
57 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
58 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
59 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
60 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
61 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
62 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
63 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
64 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
65 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
66 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
67 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
68 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
69 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
70 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
71 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
72 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
73 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
74 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
75 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
76 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
77 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
78 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
79 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
80 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
81 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
82 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
83 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
84 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
85 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
86 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
87 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
88 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
89 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
90 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
91 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
92 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
93 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
94 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
95 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
96 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
97 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
98 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
99 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
100 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
101 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
102 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
103 |
+
HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
|
104 |
+
HTTP Request: POST https://api.freegpt.today/v1/chat/completions "HTTP/1.1 200 OK"
|
105 |
+
HTTP Request: POST https://api.freegpt.today/v1/chat/completions "HTTP/1.1 200 OK"
|
106 |
+
HTTP Request: POST https://api.freegpt.today/v1/chat/completions "HTTP/1.1 200 OK"
|
107 |
+
HTTP Request: POST https://api.freegpt.today/v1/chat/completions "HTTP/1.1 200 OK"
|
108 |
+
HTTP Request: POST https://api.freegpt.today/v1/chat/completions "HTTP/1.1 200 OK"
|
109 |
+
HTTP Request: POST https://api.freegpt.today/v1/chat/completions "HTTP/1.1 200 OK"
|
110 |
+
HTTP Request: POST https://api.freegpt.today/v1/chat/completions "HTTP/1.1 200 OK"
|
111 |
+
HTTP Request: POST https://api.freegpt.today/v1/chat/completions "HTTP/1.1 200 OK"
|
112 |
+
HTTP Request: POST https://api.freegpt.today/v1/chat/completions "HTTP/1.1 200 OK"
|
113 |
+
HTTP Request: POST https://api.freegpt.today/v1/chat/completions "HTTP/1.1 200 OK"
|
triplets2/books/SILENT WEAPONS for QUIET WARS.pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0565c7ac700ebd2974d3d7d17cb8ad5bcded4ad0527f0b3bfec492d5c3d8017c
|
3 |
+
size 1979165
|
triplets2/dev/kg_syngen.py
ADDED
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import json
|
3 |
+
from pprint import pprint
|
4 |
+
import queue
|
5 |
+
import random
|
6 |
+
import threading
|
7 |
+
# import random
|
8 |
+
# import pandas as pd
|
9 |
+
import dspy
|
10 |
+
from dspy.teleprompt import BootstrapFewShot
|
11 |
+
from dspy.evaluate import Evaluate
|
12 |
+
from tqdm import tqdm
|
13 |
+
|
14 |
+
|
15 |
+
from opus.modules.text2kg import Text2KGModule
|
16 |
+
from opus.modules.text2proposition import Text2PropositionModule, PropositionExamples
|
17 |
+
|
18 |
+
# from utils.dataframe_utils import dataframe_to_documents
|
19 |
+
# from opus.utils.dataframe_utils import dataframe_to_documents
|
20 |
+
# from opus.metric.base import factuality_metric
|
21 |
+
|
22 |
+
|
23 |
+
turbo = dspy.OpenAI(model='gpt-3.5-turbo', temperature=0.1, presence_penalty=0.2, max_tokens=1200, top_p=0.89, api_base='http://localhost:6000/v1/', api_key='asdf', timeout=200)
|
24 |
+
# turbo = dspy.OpenAI(model='mixtral-turbo', temperature=0.5, max_tokens=1024, api_base='http://localhost:6000/v1/', api_key='asdf')#, timeout=200)
|
25 |
+
|
26 |
+
dspy.settings.configure(lm=turbo)
|
27 |
+
NUM_THREADS=10
|
28 |
+
|
29 |
+
class AssessPrediction(dspy.Signature):
|
30 |
+
"""Assess wether this is a high quality Proposition extract and that propositions are formated correctly"""
|
31 |
+
|
32 |
+
text = dspy.InputField()
|
33 |
+
proposition = dspy.InputField()
|
34 |
+
|
35 |
+
assessment_answer = dspy.OutputField(desc="Yes or No")
|
36 |
+
|
37 |
+
program = dspy.ChainOfThought(AssessPrediction)
|
38 |
+
|
39 |
+
def factuality_metric(gold, pred, trace=None):
|
40 |
+
# print(f"gold: {gold}")
|
41 |
+
# print(f"pred: {pred}")
|
42 |
+
# input("Press enter to continue")
|
43 |
+
# _program_output = dspy. (text=gold.passage, proposition=pred)
|
44 |
+
_program_output = dspy.predict
|
45 |
+
|
46 |
+
propostions = "(" + ", ".join(f"'{item}'" for item in pred) + ")"
|
47 |
+
|
48 |
+
program = dspy.ChainOfThought(AssessPrediction)
|
49 |
+
assessment = program(text=gold.passage, proposition=propostions)
|
50 |
+
score = assessment.assessment_answer.split()[0].lower() == 'yes'
|
51 |
+
return score
|
52 |
+
|
53 |
+
|
54 |
+
from opus.utils.load_documents import load_data
|
55 |
+
|
56 |
+
CoT = Text2PropositionModule()
|
57 |
+
asessor = dspy.ChainOfThought(AssessPrediction)
|
58 |
+
|
59 |
+
|
60 |
+
def process_document(documents):
|
61 |
+
_response_example_obj = CoT(passage=documents.text)
|
62 |
+
pprint(_response_example_obj)
|
63 |
+
|
64 |
+
response_example = "(" + ", ".join(f"'{item}'" for item in _response_example_obj) + ")"
|
65 |
+
|
66 |
+
_program_output = asessor(text=documents.text, proposition=response_example)
|
67 |
+
assessment = _program_output.assessment_answer.lower() == "yes"
|
68 |
+
pprint(f"assessment {assessment}")
|
69 |
+
|
70 |
+
return _response_example_obj, assessment
|
71 |
+
|
72 |
+
# Function to process a single document
|
73 |
+
# Modified main loop to accept tqdm progress bar instance
|
74 |
+
def main_loop(q, output_file, pbar):
|
75 |
+
while True:
|
76 |
+
documents = q.get()
|
77 |
+
if documents is None:
|
78 |
+
q.task_done()
|
79 |
+
break # Sentinel value to end thread
|
80 |
+
|
81 |
+
# Call the function in the main loop
|
82 |
+
_response_example_obj, assessment = process_document(documents)
|
83 |
+
|
84 |
+
with open(output_file, "a") as f:
|
85 |
+
# json.dump({"epoch": epoch, "summary": summary}, f)
|
86 |
+
# add metadata, text, kg, and summary to the file
|
87 |
+
json.dump({
|
88 |
+
"metadata": documents.metadata,
|
89 |
+
"text": documents.text,
|
90 |
+
"kg": _response_example_obj,
|
91 |
+
"assessment": assessment,
|
92 |
+
}
|
93 |
+
,
|
94 |
+
f)
|
95 |
+
f.write("\n") # Ensure each summary is on a new line
|
96 |
+
|
97 |
+
q.task_done()
|
98 |
+
pbar.update(1) # Update progress bar
|
99 |
+
|
100 |
+
# Function to initiate processing with threading and tqdm
|
101 |
+
def process_documents_with_threads(document_data_list, num_threads=5, output_file="undefined-syngen.jsonl"):
|
102 |
+
q = queue.Queue()
|
103 |
+
threads = []
|
104 |
+
|
105 |
+
# Initialize tqdm progress bar
|
106 |
+
pbar = tqdm(total=len(document_data_list))
|
107 |
+
|
108 |
+
# Start threads
|
109 |
+
for _ in range(num_threads):
|
110 |
+
t = threading.Thread(target=main_loop, args=(q, output_file, pbar))
|
111 |
+
t.start()
|
112 |
+
threads.append(t)
|
113 |
+
|
114 |
+
# Add documents to queue
|
115 |
+
for document in document_data_list:
|
116 |
+
q.put(document)
|
117 |
+
|
118 |
+
# Add sentinel values to queue to signal threads to stop
|
119 |
+
for _ in range(num_threads):
|
120 |
+
q.put(None)
|
121 |
+
|
122 |
+
# Wait for all threads to complete
|
123 |
+
for t in threads:
|
124 |
+
t.join()
|
125 |
+
|
126 |
+
pbar.close() # Close progress bar
|
127 |
+
|
128 |
+
|
129 |
+
NUM_THREADS = 20
|
130 |
+
try:
|
131 |
+
DocumentDataArray = load_data('./data/books')
|
132 |
+
random.shuffle(DocumentDataArray)
|
133 |
+
process_documents_with_threads(DocumentDataArray, num_threads=NUM_THREADS, output_file="kg_syngen.jsonl")
|
134 |
+
except KeyboardInterrupt:
|
135 |
+
print("Processing stopped by user.")
|
triplets2/dev/optimized_KG2Text
ADDED
The diff for this file is too large to render.
See raw diff
|
|
triplets2/dev/text2prop_syngen.py
ADDED
@@ -0,0 +1,205 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import json
|
3 |
+
from pprint import pprint
|
4 |
+
import queue
|
5 |
+
import random
|
6 |
+
import threading
|
7 |
+
# import random
|
8 |
+
# import pandas as pd
|
9 |
+
import dspy
|
10 |
+
from dspy.teleprompt import BootstrapFewShot
|
11 |
+
from dspy.evaluate import Evaluate
|
12 |
+
from tqdm import tqdm
|
13 |
+
|
14 |
+
|
15 |
+
from opus.modules.text2kg import Text2KGModule
|
16 |
+
from opus.modules.text2proposition import Text2PropositionModule, PropositionExamples
|
17 |
+
|
18 |
+
# from utils.dataframe_utils import dataframe_to_documents
|
19 |
+
# from opus.utils.dataframe_utils import dataframe_to_documents
|
20 |
+
# from opus.metric.base import factuality_metric
|
21 |
+
|
22 |
+
|
23 |
+
turbo = dspy.OpenAI(model='gpt-3.5-turbo', temperature=0.1, presence_penalty=0.2, max_tokens=1200, top_p=0.89, api_base='http://localhost:6000/v1/', api_key='asdf', timeout=200)
|
24 |
+
# turbo = dspy.OpenAI(model='mixtral-turbo', temperature=0.5, max_tokens=1024, api_base='http://localhost:6000/v1/', api_key='asdf')#, timeout=200)
|
25 |
+
|
26 |
+
dspy.settings.configure(lm=turbo)
|
27 |
+
NUM_THREADS=10
|
28 |
+
|
29 |
+
class AssessPrediction(dspy.Signature):
|
30 |
+
"""Assess wether this is a high quality Proposition extract and that propositions are formated correctly"""
|
31 |
+
|
32 |
+
text = dspy.InputField()
|
33 |
+
proposition = dspy.InputField()
|
34 |
+
|
35 |
+
assessment_answer = dspy.OutputField(desc="Yes or No")
|
36 |
+
|
37 |
+
program = dspy.ChainOfThought(AssessPrediction)
|
38 |
+
|
39 |
+
def factuality_metric(gold, pred, trace=None):
|
40 |
+
# print(f"gold: {gold}")
|
41 |
+
# print(f"pred: {pred}")
|
42 |
+
# input("Press enter to continue")
|
43 |
+
# _program_output = dspy. (text=gold.passage, proposition=pred)
|
44 |
+
_program_output = dspy.predict
|
45 |
+
|
46 |
+
propostions = "(" + ", ".join(f"'{item}'" for item in pred) + ")"
|
47 |
+
|
48 |
+
program = dspy.ChainOfThought(AssessPrediction)
|
49 |
+
assessment = program(text=gold.passage, proposition=propostions)
|
50 |
+
score = assessment.assessment_answer.split()[0].lower() == 'yes'
|
51 |
+
return score
|
52 |
+
|
53 |
+
|
54 |
+
from opus.utils.load_documents import load_data
|
55 |
+
# save_document_data_list_to_file, load_document_data_list_from_file
|
56 |
+
# def train():
|
57 |
+
# # documents = load_data('./data/books')
|
58 |
+
# # save_document_data_list_to_file(documents, './data/text2proposition-books.json')
|
59 |
+
# # documents = load_document_data_list_from_file('./data/text2proposition-books.json')
|
60 |
+
|
61 |
+
# # # Get the documents from the parquet file
|
62 |
+
# # documents = dataframe_to_documents(df)
|
63 |
+
# # random.shuffle(documents)
|
64 |
+
|
65 |
+
# # # # Split documents into train, validation, and test sets
|
66 |
+
# # split1, split2 = len(documents) // 3, 2 * len(documents) // 3
|
67 |
+
# # train, validation, test_set = documents[:split1], documents[split1:split2], documents[split2:]
|
68 |
+
|
69 |
+
|
70 |
+
# # for each string in array _prop, merge all the arrays together and separate t hem by a comman join them thanks
|
71 |
+
# _PropositionExamples = [dspy.Example(proposition='['+','.join(prop['propositions'])+']', passage=prop['passage']) for prop in PropositionExamples]
|
72 |
+
# _PropositionExamples = [dspy.Example(**prop).with_inputs('passage') for prop in PropositionExamples]
|
73 |
+
|
74 |
+
# # train = _PropositionExamples[:10]
|
75 |
+
# # validation = _PropositionExamples[:10]
|
76 |
+
# # test_set = test_set[:15]
|
77 |
+
# # train = [
|
78 |
+
# # # dspy.Example(proposition=','.join(prop['proposition']), passage
|
79 |
+
# # dspy.Example(proposition="fuck you", passage="hello world").with_inputs("passage")
|
80 |
+
# # ]
|
81 |
+
# train = _PropositionExamples[:5]
|
82 |
+
# validation = _PropositionExamples[5:]
|
83 |
+
# pprint(train)
|
84 |
+
|
85 |
+
# print("val")
|
86 |
+
# pprint(validation)
|
87 |
+
|
88 |
+
|
89 |
+
# teleprompter = BootstrapFewShot(metric=factuality_metric, max_bootstrapped_demos=3, max_labeled_demos=16)
|
90 |
+
# optimized_prop = teleprompter.compile(Text2PropositionModule(), trainset=train, valset=validation)
|
91 |
+
|
92 |
+
# # # # print(f"optimized_text2kg: {optimized_text2kg}")
|
93 |
+
|
94 |
+
|
95 |
+
# yes_responses = []
|
96 |
+
|
97 |
+
|
98 |
+
|
99 |
+
|
100 |
+
|
101 |
+
|
102 |
+
# # If you want to see the documents and propositions stored for "YES" responses
|
103 |
+
# for doc, prop in yes_responses:
|
104 |
+
# print("\nDocument:", doc.text)
|
105 |
+
# print("Proposition:", prop)
|
106 |
+
|
107 |
+
|
108 |
+
# if __name__ == "__main__":
|
109 |
+
# train()
|
110 |
+
|
111 |
+
CoT = Text2PropositionModule()
|
112 |
+
asessor = dspy.ChainOfThought(AssessPrediction)
|
113 |
+
|
114 |
+
# Function to process a single document
|
115 |
+
# Modified main loop to accept tqdm progress bar instance
|
116 |
+
def main_loop(q, output_file, pbar):
|
117 |
+
while True:
|
118 |
+
documents = q.get()
|
119 |
+
if documents is None:
|
120 |
+
q.task_done()
|
121 |
+
break # Sentinel value to end thread
|
122 |
+
|
123 |
+
|
124 |
+
_response_example_obj = CoT(passage=documents.text)
|
125 |
+
# print(f"Document: {document.text}\nProposition:")
|
126 |
+
pprint(_response_example_obj)
|
127 |
+
|
128 |
+
# response_example = ', '.join(response_example)
|
129 |
+
response_example = "(" + ", ".join(f"'{item}'" for item in _response_example_obj) + ")"
|
130 |
+
# turn response [] array into a , commas separated
|
131 |
+
|
132 |
+
_program_output = asessor(text=documents.text, proposition=response_example)
|
133 |
+
assessment = _program_output.assessment_answer.lower() == "yes"
|
134 |
+
pprint(f"assessment {assessment}")
|
135 |
+
|
136 |
+
|
137 |
+
# Save the summary safely to an output file
|
138 |
+
with open(output_file, "a") as f:
|
139 |
+
# json.dump({"epoch": epoch, "summary": summary}, f)
|
140 |
+
# add metadata, text, kg, and summary to the file
|
141 |
+
json.dump({
|
142 |
+
"metadata": documents.metadata,
|
143 |
+
"text": documents.text,
|
144 |
+
"proposition": _response_example_obj,
|
145 |
+
# 'page_label':
|
146 |
+
# '2'
|
147 |
+
# 'file_name':
|
148 |
+
# 'Property1_Turner_Dec2014.pdf'
|
149 |
+
# 'file_path':
|
150 |
+
# '/home/fullstack/dev/licensed/apps/opus/data/books/Property1_Turner_Dec2014.pdf'
|
151 |
+
# 'file_type':
|
152 |
+
# 'application/pdf'
|
153 |
+
# 'file_size':
|
154 |
+
# 3652131
|
155 |
+
# 'creation_date':
|
156 |
+
# '2024-03-12'
|
157 |
+
# 'last_modified_date':
|
158 |
+
# '2024-03-12'
|
159 |
+
# len():
|
160 |
+
# 7
|
161 |
+
#
|
162 |
+
}
|
163 |
+
,
|
164 |
+
f)
|
165 |
+
f.write("\n") # Ensure each summary is on a new line
|
166 |
+
|
167 |
+
q.task_done()
|
168 |
+
pbar.update(1) # Update progress bar
|
169 |
+
|
170 |
+
# Function to initiate processing with threading and tqdm
|
171 |
+
def process_documents_with_threads(document_data_list, num_threads=5, output_file="./output/propositions.jsonl"):
|
172 |
+
q = queue.Queue()
|
173 |
+
threads = []
|
174 |
+
|
175 |
+
# Initialize tqdm progress bar
|
176 |
+
pbar = tqdm(total=len(document_data_list))
|
177 |
+
|
178 |
+
# Start threads
|
179 |
+
for _ in range(num_threads):
|
180 |
+
t = threading.Thread(target=main_loop, args=(q, output_file, pbar))
|
181 |
+
t.start()
|
182 |
+
threads.append(t)
|
183 |
+
|
184 |
+
# Add documents to queue
|
185 |
+
for document in document_data_list:
|
186 |
+
q.put(document)
|
187 |
+
|
188 |
+
# Add sentinel values to queue to signal threads to stop
|
189 |
+
for _ in range(num_threads):
|
190 |
+
q.put(None)
|
191 |
+
|
192 |
+
# Wait for all threads to complete
|
193 |
+
for t in threads:
|
194 |
+
t.join()
|
195 |
+
|
196 |
+
pbar.close() # Close progress bar
|
197 |
+
|
198 |
+
|
199 |
+
NUM_THREADS = 20
|
200 |
+
try:
|
201 |
+
DocumentDataArray = load_data('./data/books')
|
202 |
+
random.shuffle(DocumentDataArray)
|
203 |
+
process_documents_with_threads(DocumentDataArray, num_threads=NUM_THREADS, output_file="propostions_syngen.jsonl")
|
204 |
+
except KeyboardInterrupt:
|
205 |
+
print("Processing stopped by user.")
|
triplets2/document_data_chunked_input.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0d32b9544f2fcffb03b012ccaaa98eba606b534aa011d1c14385f3fc71859699
|
3 |
+
size 36256148
|
triplets2/k25-infer.py
ADDED
@@ -0,0 +1,353 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from doctest import Example
|
2 |
+
from operator import is_
|
3 |
+
from pprint import pp
|
4 |
+
|
5 |
+
from typing import List
|
6 |
+
import random
|
7 |
+
from marshmallow import missing
|
8 |
+
import pandas as pd
|
9 |
+
from pydantic import BaseModel, Field
|
10 |
+
import dspy
|
11 |
+
import dspy
|
12 |
+
from dspy.teleprompt import BootstrapFewShot
|
13 |
+
from dspy.evaluate import Evaluate, metrics
|
14 |
+
from dspy.functional import TypedPredictor
|
15 |
+
|
16 |
+
|
17 |
+
|
18 |
+
class KGTriple(BaseModel):
|
19 |
+
# The subject of the triple, representing the entity or concept the triple is about.
|
20 |
+
subject: str = Field(..., description="The subject of the Knowledge Graph triple.")
|
21 |
+
|
22 |
+
# The predicate of the triple, representing the relationship or attribute of the subject.
|
23 |
+
predicate: str = Field(..., description="The predicate of the Knowledge Graph triple, defining the type of relationship between the subject and object.")
|
24 |
+
|
25 |
+
# The object of the triple, representing the entity or concept that is related to the subject by the predicate.
|
26 |
+
object: str = Field(..., description="The object of the Knowledge Graph triple, representing the entity or value that the subject is related to via the predicate.")
|
27 |
+
|
28 |
+
|
29 |
+
class KG(BaseModel):
|
30 |
+
triples: List[KGTriple]
|
31 |
+
|
32 |
+
|
33 |
+
class Document(BaseModel):
|
34 |
+
text: str
|
35 |
+
page_label: str
|
36 |
+
file_name: str
|
37 |
+
file_type: str
|
38 |
+
file_size: int
|
39 |
+
creation_date: str
|
40 |
+
kg: str
|
41 |
+
summary: str
|
42 |
+
# answer: str
|
43 |
+
# input_text: Optio
|
44 |
+
|
45 |
+
|
46 |
+
class Text2KG(dspy.Signature):
|
47 |
+
"""
|
48 |
+
The task involves analyzing an input text to extract a Knowledge Graph (KG) represented by triples in the format ("subject", "predicate", "object"). Each triple should convey a specific fact or relationship derived from the text.
|
49 |
+
|
50 |
+
Your goal is to distill complex sentences into fundamental components, focusing on extracting succinct and atomic pieces of information. Instead of using entire sentences as triples, break down the information into the most granular facts possible. This process might involve decomposing a single sentence into multiple triples, each highlighting a distinct piece of information or relationship.
|
51 |
+
|
52 |
+
Be meticulous in identifying subjects, predicates, and objects, ensuring that each triple accurately reflects the relationships within the text. This extraction task does not require verifying the factual accuracy of the content; your primary objective is to transform the text into a structured format that accurately captures the key points and relationships presented.
|
53 |
+
|
54 |
+
### Guidelines:
|
55 |
+
- Identify key entities and actions within the text as subjects and predicates.
|
56 |
+
- Distill complex information into multiple, atomic triples when necessary.
|
57 |
+
- Ensure that triples are independent and self-contained, conveying clear relationships.
|
58 |
+
- Avoid redundancy and ensure that each triple contributes unique information to the KG.
|
59 |
+
- triples format is a long string of ('subject', 'predicate', 'object'), ('subject', 'predicate', 'object'), ...
|
60 |
+
"""
|
61 |
+
|
62 |
+
input_text = dspy.InputField()
|
63 |
+
triples = dspy.OutputField(desc="('subject', 'predicate', 'object'), ...")
|
64 |
+
|
65 |
+
class KG2Text(dspy.Signature):
|
66 |
+
"""
|
67 |
+
Given a set of Knowledge Graph (KG) triples, your task is to organize and present the information from these triples in a detailed, factual, and evidence-based document. The document should closely adhere to the provided triples, logically arranging the information to ensure clarity and factual integrity.
|
68 |
+
|
69 |
+
Avoid creating a narrative or adding interpretive elements. Instead, focus on structuring the KG's factual content into a coherent, detailed document that directly reflects the relationships and entities within the KG.
|
70 |
+
|
71 |
+
### Guidelines:
|
72 |
+
- Present the KG triples in a logically structured manner, maintaining the factual content.
|
73 |
+
- Do not infer or add information not explicitly contained within the KG triples.
|
74 |
+
- Arrange the triples to enhance understanding, grouping related facts where applicable.
|
75 |
+
- Use clear and concise language to ensure the factual basis of the KG is communicated effectively.
|
76 |
+
"""
|
77 |
+
|
78 |
+
input_kg = dspy.InputField(desc="The KG triples to summarize")
|
79 |
+
summary = dspy.OutputField(desc="The synthesized narrative or description")
|
80 |
+
|
81 |
+
class SummaryCritic(dspy.Signature):
|
82 |
+
"""Point out what the Summary is missing from the text
|
83 |
+
|
84 |
+
"""
|
85 |
+
|
86 |
+
input_text = dspy.InputField(desc="The text to generate KG from")
|
87 |
+
kg = dspy.InputField(desc="The generated KG in triples format surrounded by quotes")
|
88 |
+
summary = dspy.InputField(desc="The synthesized narrative or description")
|
89 |
+
|
90 |
+
summary_critique = dspy.OutputField(desc="The critique of the summary and details of any missing information")
|
91 |
+
summary_rewrite = dspy.OutputField(desc="The rewritten summary with the missing information added")
|
92 |
+
missing_kg = dspy.OutputField(desc='triples in the format ("subject", "predicate", "object")')
|
93 |
+
|
94 |
+
|
95 |
+
# kg_string = ", ".join(f'("{triple.subject}", "{triple.predicate}", "{triple.object}")' for triple in doc.kg.triples)
|
96 |
+
|
97 |
+
|
98 |
+
class Text2Triple(dspy.Signature):
|
99 |
+
"""Convert"""
|
100 |
+
text: str = dspy.InputField(desc="Text to extract triple pairs") #desc="The first KG in the comparison")
|
101 |
+
# missing_kg: str = dspy.InputField(desc="The second KG")
|
102 |
+
object: List[KGTriple] = dspy.OutputField(desc="triples in the format ('subject', 'predicate', 'object')")
|
103 |
+
|
104 |
+
# cot_predictor = TypedPredictor(CodeSignature)
|
105 |
+
|
106 |
+
# _kg: KGTripleList = cot_predictor(string=example.kg).kg
|
107 |
+
class Text2KGModule(dspy.Module):
|
108 |
+
def __init__(self):
|
109 |
+
super().__init__()
|
110 |
+
self.text2kg = dspy.Predict(Text2KG)
|
111 |
+
self.kg2text = dspy.ChainOfThought(KG2Text)
|
112 |
+
self.critic = dspy.Predict(SummaryCritic)
|
113 |
+
self.code_signature = TypedPredictor(Text2Triple)
|
114 |
+
|
115 |
+
def forward(self, text):
|
116 |
+
kg = self.text2kg(input_text=text)
|
117 |
+
# text2kg = dspy.Predict(Text2KG)
|
118 |
+
|
119 |
+
kg2text = dspy.ChainOfThought(KG2Text)
|
120 |
+
critic = dspy.Predict(SummaryCritic)
|
121 |
+
text2tripleObj = TypedPredictor(Text2Triple)
|
122 |
+
|
123 |
+
summary = kg2text(input_kg=kg.triples).summary
|
124 |
+
critique = critic(input_text=text, kg=kg.triples, summary=summary)
|
125 |
+
|
126 |
+
missing_kg = critique.missing_kg
|
127 |
+
# # rewritten_summary = critique.summary_rewrite
|
128 |
+
|
129 |
+
try:
|
130 |
+
triples = text2tripleObj(text=kg.triples).object
|
131 |
+
missing_triples = text2tripleObj(text=missing_kg).object
|
132 |
+
|
133 |
+
|
134 |
+
triples_string = ", ".join([f'("{triple.subject}", "{triple.predicate}", "{triple.object}")' for triple in triples])
|
135 |
+
|
136 |
+
kg.triples = triples_string
|
137 |
+
|
138 |
+
missing_triples_string = ", ".join([f'("{triple.subject}", "{triple.predicate}", "{triple.object}")' for triple in missing_triples])
|
139 |
+
kg.triples = triples_string + ", " + missing_triples_string
|
140 |
+
except Exception as e:
|
141 |
+
return kg
|
142 |
+
|
143 |
+
return kg
|
144 |
+
|
145 |
+
# return self.prog(input_text=text)
|
146 |
+
|
147 |
+
|
148 |
+
turbo = dspy.OpenAI(model='gpt-3.5-turbo', temperature=0.1, max_tokens=1200, top_p=0.89, api_base='https://freegpt.today/v1', api_key='asdf', timeout=200)
|
149 |
+
|
150 |
+
dspy.settings.configure(lm=turbo)
|
151 |
+
|
152 |
+
|
153 |
+
text2kg = Text2KGModule()
|
154 |
+
|
155 |
+
file_name='./optimized_k24-working-42pc.json'
|
156 |
+
text2kg.load(file_name)
|
157 |
+
|
158 |
+
|
159 |
+
res = text2kg(text="The quick brown fox jumps over the lazy dog")
|
160 |
+
pp(res)
|
161 |
+
|
162 |
+
|
163 |
+
|
164 |
+
|
165 |
+
|
166 |
+
|
167 |
+
|
168 |
+
|
169 |
+
|
170 |
+
|
171 |
+
|
172 |
+
|
173 |
+
|
174 |
+
|
175 |
+
|
176 |
+
|
177 |
+
|
178 |
+
|
179 |
+
|
180 |
+
|
181 |
+
|
182 |
+
|
183 |
+
|
184 |
+
|
185 |
+
|
186 |
+
|
187 |
+
# import queue
|
188 |
+
# import threading
|
189 |
+
# import json
|
190 |
+
# import time
|
191 |
+
# from llama_index_client import SentenceSplitter
|
192 |
+
# from pydantic import BaseModel
|
193 |
+
# import json
|
194 |
+
# from typing import List
|
195 |
+
# import dspy
|
196 |
+
# from pydantic import BaseModel, parse_obj_as
|
197 |
+
# from llama_index.core import SimpleDirectoryReader
|
198 |
+
|
199 |
+
# from dspy import dsp
|
200 |
+
|
201 |
+
# from tqdm import tqdm
|
202 |
+
|
203 |
+
|
204 |
+
|
205 |
+
# class DocumentData(BaseModel):
|
206 |
+
# """# Example data from your output
|
207 |
+
# data_example = {
|
208 |
+
# "uploaded_file_id": "ExampleUploadedFileID",
|
209 |
+
# "text": "Beyond Language Models: Byte Models are Digital World Simulators\nShangda Wu1 2* Xu Tan1* Zili Wang3Rui Wang1Xiaobing Li2Maosong Sun2 4\nhttps://byte-gpt.github.io\nAbstract\nTraditional deep learning often overlooks bytes,\nthe basic units of the digital world, where all\nforms of information and operations are encoded\nand manipulated in binary format. Inspired by\nthe success of next token prediction in natural lan-\nguage processing, we introduce bGPT, a model\nwith next byte prediction to simulate the digi-\ntal world. bGPT matches specialized models in\nperformance across various modalities, including\ntext, audio, and images, and offers new possibil-\nities for predicting, simulating, and diagnosing\nalgorithm or hardware behaviour. It has almost\nflawlessly replicated the process of converting\nsymbolic music data, achieving a low error rate of\n0.0011 bits per byte in converting ABC notation\nto MIDI format. In addition, bGPT demonstrates\nexceptional capabilities in simulating CPU be-\nhaviour, with an accuracy exceeding 99.99% in\nexecuting various operations. Leveraging next\nbyte prediction, models like bGPT can directly\nlearn from vast binary data, effectively simulating\nthe intricate patterns of the digital world.",
|
210 |
+
# "user_id": "ExampleUserID",
|
211 |
+
# "metadata": json.dumps({
|
212 |
+
# "page_label": "1",
|
213 |
+
# "file_name": "Beyond Language Models: Byte Models are Digital World Simulators.pdf",
|
214 |
+
# "file_path": "/home/fullstack/dev/licensed/apps/dspytest/notes/Beyond Language Models: Byte Models are Digital World Simulators.pdf",
|
215 |
+
# "file_type": "application/pdf",
|
216 |
+
# "file_size": 771513,
|
217 |
+
# "creation_date": "2024-03-05",
|
218 |
+
# "last_modified_date": "2024-03-03"
|
219 |
+
# })
|
220 |
+
# }
|
221 |
+
# """
|
222 |
+
# # uploaded_file_id: str
|
223 |
+
# text: str
|
224 |
+
# # user_id: str
|
225 |
+
# metadata: dict # Since you're storing JSON, let's keep this as a dictMetadata
|
226 |
+
|
227 |
+
# # from llama_index.node_parser import SimpleNodeParser
|
228 |
+
# from llama_index.core.node_parser import SimpleNodeParser
|
229 |
+
|
230 |
+
|
231 |
+
# # Assuming DocumentData is already defined
|
232 |
+
# class DocumentData(BaseModel):
|
233 |
+
# text: str
|
234 |
+
# metadata: dict
|
235 |
+
|
236 |
+
# def clean_nul_chars(s):
|
237 |
+
# return s.replace('\x00', '')
|
238 |
+
|
239 |
+
|
240 |
+
# def load_data(directory="./books", text_splitter_cls=SentenceSplitter, chunk_size=1000, chunk_overlap=0):
|
241 |
+
# documents = SimpleDirectoryReader(input_dir=directory, exclude=[]).load_data()
|
242 |
+
|
243 |
+
# if len(documents) == 0:
|
244 |
+
# raise Exception("No documents found in the specified directory.")
|
245 |
+
|
246 |
+
# document_data_list = []
|
247 |
+
|
248 |
+
# node_parser = SimpleNodeParser(chunk_size=512, chunk_overlap=20, include_metadata=True)
|
249 |
+
|
250 |
+
# from llama_index.core import Document
|
251 |
+
# nodes = node_parser.get_nodes_from_documents(documents, show_progress=False )
|
252 |
+
|
253 |
+
# document_data_list = [
|
254 |
+
# DocumentData(
|
255 |
+
# text=node.text,
|
256 |
+
# metadata=json.loads(json.dumps(node.metadata)) # Assuming metadata is unchanged per chunk
|
257 |
+
# ) for node in nodes
|
258 |
+
# ]
|
259 |
+
# return document_data_list
|
260 |
+
|
261 |
+
# def save_document_data_list_to_file(document_data_list: List[DocumentData], file_path: str):
|
262 |
+
# with open(file_path, 'w', encoding='utf-8') as file:
|
263 |
+
# # Convert list of DocumentData instances to list of dictionaries and then to JSON
|
264 |
+
# json_data = json.dumps([doc.dict() for doc in document_data_list], indent=4)
|
265 |
+
# file.write(json_data)
|
266 |
+
|
267 |
+
# def load_document_data_list_from_file(file_path: str) -> List[DocumentData]:
|
268 |
+
# with open(file_path, 'r', encoding='utf-8') as file:
|
269 |
+
# # Load JSON data and then convert it to list of DocumentData instances
|
270 |
+
# json_data = json.load(file)
|
271 |
+
# document_data_list = parse_obj_as(List[DocumentData], json_data)
|
272 |
+
# return document_data_list
|
273 |
+
|
274 |
+
# file_path_to_save = './document_data_chunked_input.json'
|
275 |
+
# document_data_list = load_data()
|
276 |
+
# save_document_data_list_to_file(document_data_list, file_path_to_save)
|
277 |
+
# document_data_list = load_document_data_list_from_file(file_path_to_save)
|
278 |
+
|
279 |
+
|
280 |
+
# # Function to process a single document
|
281 |
+
# # Modified main loop to accept tqdm progress bar instance
|
282 |
+
# def main_loop(q, epoch, output_file, pbar):
|
283 |
+
# while True:
|
284 |
+
# document = q.get()
|
285 |
+
# if document is None:
|
286 |
+
# q.task_done()
|
287 |
+
# break # Sentinel value to end thread
|
288 |
+
|
289 |
+
# input_text = document.text
|
290 |
+
|
291 |
+
# kg = text2kg(text=input_text)
|
292 |
+
# # text2tripleObj = TypedPredictor(Text2Triple)
|
293 |
+
# # triples = text2tripleObj(text=kg.triples).object
|
294 |
+
# pp(kg)
|
295 |
+
|
296 |
+
# # Save the summary safely to an output file
|
297 |
+
# with open('books-kg.jsonl', "a") as f:
|
298 |
+
# # json.dump({"epoch": epoch, "summary": summary}, f)
|
299 |
+
# # add metadata, text, kg, and summary to the file
|
300 |
+
# json.dump({
|
301 |
+
# "metadata": document.metadata,
|
302 |
+
# "text": document.text,
|
303 |
+
# "kg": kg.triples
|
304 |
+
# }
|
305 |
+
# ,
|
306 |
+
# f)
|
307 |
+
# f.write("\n") # Ensure each summary is on a new line
|
308 |
+
|
309 |
+
# q.task_done()
|
310 |
+
# pbar.update(1) # Update progress bar
|
311 |
+
|
312 |
+
|
313 |
+
|
314 |
+
|
315 |
+
# q.task_done()
|
316 |
+
# pbar.update(1) # Update progress bar
|
317 |
+
|
318 |
+
# # Function to initiate processing with threading and tqdm
|
319 |
+
# def process_documents_with_threads(document_data_list, num_threads=5, output_file="summaries.jsonl"):
|
320 |
+
# q = queue.Queue()
|
321 |
+
# threads = []
|
322 |
+
|
323 |
+
# # Initialize tqdm progress bar
|
324 |
+
# pbar = tqdm(total=len(document_data_list))
|
325 |
+
|
326 |
+
# # Start threads
|
327 |
+
# for _ in range(num_threads):
|
328 |
+
# t = threading.Thread(target=main_loop, args=(q, epoch, output_file, pbar))
|
329 |
+
# t.start()
|
330 |
+
# threads.append(t)
|
331 |
+
|
332 |
+
# # Add documents to queue
|
333 |
+
# for document in document_data_list:
|
334 |
+
# q.put(document)
|
335 |
+
|
336 |
+
# # Add sentinel values to queue to signal threads to stop
|
337 |
+
# for _ in range(num_threads):
|
338 |
+
# q.put(None)
|
339 |
+
|
340 |
+
# # Wait for all threads to complete
|
341 |
+
# for t in threads:
|
342 |
+
# t.join()
|
343 |
+
|
344 |
+
# pbar.close() # Close progress bar
|
345 |
+
|
346 |
+
# DocumentDataArray = [DocumentData(text=document.text, metadata=document.metadata) for document in document_data_list]
|
347 |
+
# epoch = 0
|
348 |
+
# # iteration_seconds = 10 # Number of seconds between iterations
|
349 |
+
# NUM_THREADS = 18
|
350 |
+
# try:
|
351 |
+
# process_documents_with_threads(DocumentDataArray, num_threads=NUM_THREADS, output_file="summaries.jsonl")
|
352 |
+
# except KeyboardInterrupt:
|
353 |
+
# print("Processing stopped by user.")
|
triplets2/k25.py
ADDED
@@ -0,0 +1,376 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from doctest import Example
|
2 |
+
from operator import is_
|
3 |
+
from pprint import pp
|
4 |
+
|
5 |
+
from typing import List
|
6 |
+
import random
|
7 |
+
from marshmallow import missing
|
8 |
+
import pandas as pd
|
9 |
+
from pydantic import BaseModel, Field
|
10 |
+
import dspy
|
11 |
+
import dspy
|
12 |
+
from dspy.teleprompt import BootstrapFewShot
|
13 |
+
from dspy.evaluate import Evaluate, metrics
|
14 |
+
from dspy.functional import TypedPredictor
|
15 |
+
|
16 |
+
|
17 |
+
|
18 |
+
class KGTriple(BaseModel):
|
19 |
+
# The subject of the triple, representing the entity or concept the triple is about.
|
20 |
+
subject: str = Field(..., description="The subject of the Knowledge Graph triple.")
|
21 |
+
|
22 |
+
# The predicate of the triple, representing the relationship or attribute of the subject.
|
23 |
+
predicate: str = Field(..., description="The predicate of the Knowledge Graph triple, defining the type of relationship between the subject and object.")
|
24 |
+
|
25 |
+
# The object of the triple, representing the entity or concept that is related to the subject by the predicate.
|
26 |
+
object: str = Field(..., description="The object of the Knowledge Graph triple, representing the entity or value that the subject is related to via the predicate.")
|
27 |
+
|
28 |
+
|
29 |
+
class KG(BaseModel):
|
30 |
+
triples: List[KGTriple]
|
31 |
+
|
32 |
+
|
33 |
+
class Document(BaseModel):
|
34 |
+
text: str
|
35 |
+
page_label: str
|
36 |
+
file_name: str
|
37 |
+
file_type: str
|
38 |
+
file_size: int
|
39 |
+
creation_date: str
|
40 |
+
kg: str
|
41 |
+
summary: str
|
42 |
+
# answer: str
|
43 |
+
# input_text: Optio
|
44 |
+
|
45 |
+
|
46 |
+
class Text2KG(dspy.Signature):
|
47 |
+
"""
|
48 |
+
The task involves analyzing an input text to extract a Knowledge Graph (KG) represented by triples in the format ("subject", "predicate", "object"). Each triple should convey a specific fact or relationship derived from the text.
|
49 |
+
|
50 |
+
Your goal is to distill complex sentences into fundamental components, focusing on extracting succinct and atomic pieces of information. Instead of using entire sentences as triples, break down the information into the most granular facts possible. This process might involve decomposing a single sentence into multiple triples, each highlighting a distinct piece of information or relationship.
|
51 |
+
|
52 |
+
Be meticulous in identifying subjects, predicates, and objects, ensuring that each triple accurately reflects the relationships within the text. This extraction task does not require verifying the factual accuracy of the content; your primary objective is to transform the text into a structured format that accurately captures the key points and relationships presented.
|
53 |
+
|
54 |
+
### Guidelines:
|
55 |
+
- Identify key entities and actions within the text as subjects and predicates.
|
56 |
+
- Distill complex information into multiple, atomic triples when necessary.
|
57 |
+
- Ensure that triples are independent and self-contained, conveying clear relationships.
|
58 |
+
- Avoid redundancy and ensure that each triple contributes unique information to the KG.
|
59 |
+
- triples format is a long string of ('subject', 'predicate', 'object'), ('subject', 'predicate', 'object'), ...
|
60 |
+
|
61 |
+
Rules: Predicates should be one word
|
62 |
+
"""
|
63 |
+
|
64 |
+
input_text = dspy.InputField(format=str)
|
65 |
+
triples = dspy.OutputField(desc="('subject', 'predicate', 'object'), ...")
|
66 |
+
|
67 |
+
class KG2Text(dspy.Signature):
|
68 |
+
"""
|
69 |
+
Given a set of Knowledge Graph (KG) triples, your task is to organize and present the information from these triples in a detailed, factual, and evidence-based document. The document should closely adhere to the provided triples, logically arranging the information to ensure clarity and factual integrity.
|
70 |
+
|
71 |
+
Avoid creating a narrative or adding interpretive elements. Instead, focus on structuring the KG's factual content into a coherent, detailed document that directly reflects the relationships and entities within the KG.
|
72 |
+
|
73 |
+
### Guidelines:
|
74 |
+
- Present the KG triples in a logically structured manner, maintaining the factual content.
|
75 |
+
- Do not infer or add information not explicitly contained within the KG triples.
|
76 |
+
- Arrange the triples to enhance understanding, grouping related facts where applicable.
|
77 |
+
- Use clear and concise language to ensure the factual basis of the KG is communicated effectively.
|
78 |
+
"""
|
79 |
+
|
80 |
+
input_kg = dspy.InputField(desc="The KG triples to summarize")
|
81 |
+
summary = dspy.OutputField(desc="The synthesized narrative or description")
|
82 |
+
|
83 |
+
class SummaryCritic(dspy.Signature):
|
84 |
+
"""Point out what the Summary is missing from the text
|
85 |
+
|
86 |
+
"""
|
87 |
+
|
88 |
+
input_text = dspy.InputField(desc="The text to generate KG from")
|
89 |
+
kg = dspy.InputField(desc="The generated KG in triples format surrounded by quotes")
|
90 |
+
summary = dspy.InputField(desc="The synthesized narrative or description")
|
91 |
+
|
92 |
+
summary_critique = dspy.OutputField(desc="The critique of the summary and details of any missing information")
|
93 |
+
summary_rewrite = dspy.OutputField(desc="The rewritten summary with the missing information added")
|
94 |
+
missing_kg = dspy.OutputField(desc='triples in the format ("subject", "predicate", "object")')
|
95 |
+
|
96 |
+
|
97 |
+
# kg_string = ", ".join(f'("{triple.subject}", "{triple.predicate}", "{triple.object}")' for triple in doc.kg.triples)
|
98 |
+
|
99 |
+
|
100 |
+
class Text2Triple(dspy.Signature):
|
101 |
+
"""Convert triples in format (subject, predicate, object) to JSON"""
|
102 |
+
text: str = dspy.InputField(desc="Text to extract triple pairs") #desc="The first KG in the comparison")
|
103 |
+
# missing_kg: str = dspy.InputField(desc="The second KG")
|
104 |
+
object: List[KGTriple] = dspy.OutputField(desc="")
|
105 |
+
|
106 |
+
# cot_predictor = TypedPredictor(CodeSignature)
|
107 |
+
|
108 |
+
# _kg: KGTripleList = cot_predictor(string=example.kg).kg
|
109 |
+
class Text2KGModule(dspy.Module):
|
110 |
+
def __init__(self):
|
111 |
+
super().__init__()
|
112 |
+
self.text2kg = dspy.Predict(Text2KG)
|
113 |
+
self.kg2text = dspy.ChainOfThought(KG2Text)
|
114 |
+
self.critic = dspy.Predict(SummaryCritic)
|
115 |
+
self.code_signature = TypedPredictor(Text2Triple)
|
116 |
+
|
117 |
+
def forward(self, text):
|
118 |
+
kg = self.text2kg(input_text=text)
|
119 |
+
# text2kg = dspy.Predict(Text2KG)
|
120 |
+
|
121 |
+
kg2text = dspy.ChainOfThought(KG2Text)
|
122 |
+
critic = dspy.Predict(SummaryCritic)
|
123 |
+
text2tripleObj = TypedPredictor(Text2Triple)
|
124 |
+
|
125 |
+
summary = kg2text(input_kg=kg.triples).summary
|
126 |
+
critique = critic(input_text=text, kg=kg.triples, summary=summary)
|
127 |
+
|
128 |
+
missing_kg = critique.missing_kg
|
129 |
+
# # rewritten_summary = critique.summary_rewrite
|
130 |
+
|
131 |
+
try:
|
132 |
+
triples = text2tripleObj(text=kg.triples).object
|
133 |
+
missing_triples = text2tripleObj(text=missing_kg).object
|
134 |
+
|
135 |
+
|
136 |
+
triples_string = ", ".join([f'("{triple.subject}", "{triple.predicate}", "{triple.object}")' for triple in triples])
|
137 |
+
|
138 |
+
kg.triples = triples_string
|
139 |
+
|
140 |
+
missing_triples_string = ", ".join([f'("{triple.subject}", "{triple.predicate}", "{triple.object}")' for triple in missing_triples])
|
141 |
+
kg.triples = triples_string + ", " + missing_triples_string
|
142 |
+
except Exception as e:
|
143 |
+
print(f"Not working\nError: {e}")
|
144 |
+
return kg
|
145 |
+
|
146 |
+
return kg
|
147 |
+
|
148 |
+
# return self.prog(input_text=text)
|
149 |
+
|
150 |
+
|
151 |
+
API_BASE = 'https://api.freegpt.today/v1/'
|
152 |
+
# MODEL_NAME = requests.get(API_BASE+'models').json()['data'][0]['id']
|
153 |
+
# # MODEL_NAME ='mixtral-turbo'
|
154 |
+
MODEL_NAME ='gpt-3.5-turbo'
|
155 |
+
turbo = dspy.OpenAI(model_type='text', top_p=1, frequency_penalty=0.2, model=MODEL_NAME, temperature=0.1, max_tokens=6000, api_base=API_BASE, api_key='asdf', timeout=200)
|
156 |
+
dspy.settings.configure(lm=turbo)
|
157 |
+
|
158 |
+
|
159 |
+
text2kg = Text2KGModule()
|
160 |
+
|
161 |
+
file_name='./optimized_k24-working-42pc.json'
|
162 |
+
# text2kg.load(file_name)
|
163 |
+
|
164 |
+
|
165 |
+
llama3_chat = dspy.OpenAI(
|
166 |
+
model_type='chat',
|
167 |
+
# model_type='chat',
|
168 |
+
# top_p=1, presence_penalty=0.1, temperature=0.1,
|
169 |
+
max_tokens=3000,
|
170 |
+
model='llama3_70b_chat', #'wizardlm', #llama3_70b_chat', #haiku_turbo_chat',
|
171 |
+
api_base='https://api.freegpt.today/v1/',
|
172 |
+
api_key='asdf',
|
173 |
+
timeout=20000)
|
174 |
+
|
175 |
+
|
176 |
+
with dspy.context(lm=llama3_chat):
|
177 |
+
res = text2kg(text="""
|
178 |
+
The conversation involves two messages:
|
179 |
+
|
180 |
+
You reached out to Vijay, greeting him and expressing your interest in AI site development and looking for opportunities, indicating your technical background and asking how you can assist him.
|
181 |
+
Vijay responded to you, expressing enthusiasm about your experience and mentioning he is seeking a co-founder. He invited you to connect further to discuss this possibility.
|
182 |
+
""")
|
183 |
+
pp(res)
|
184 |
+
|
185 |
+
|
186 |
+
|
187 |
+
|
188 |
+
|
189 |
+
|
190 |
+
|
191 |
+
|
192 |
+
|
193 |
+
|
194 |
+
|
195 |
+
|
196 |
+
|
197 |
+
|
198 |
+
|
199 |
+
|
200 |
+
|
201 |
+
|
202 |
+
|
203 |
+
|
204 |
+
|
205 |
+
|
206 |
+
|
207 |
+
|
208 |
+
|
209 |
+
|
210 |
+
# import queue
|
211 |
+
# import threading
|
212 |
+
# import json
|
213 |
+
# import time
|
214 |
+
# from llama_index_client import SentenceSplitter
|
215 |
+
# from pydantic import BaseModel
|
216 |
+
# import json
|
217 |
+
# from typing import List
|
218 |
+
# import dspy
|
219 |
+
# from pydantic import BaseModel, parse_obj_as
|
220 |
+
# from llama_index.core import SimpleDirectoryReader
|
221 |
+
|
222 |
+
# from dspy import dsp
|
223 |
+
|
224 |
+
# from tqdm import tqdm
|
225 |
+
|
226 |
+
|
227 |
+
|
228 |
+
# class DocumentData(BaseModel):
|
229 |
+
# """# Example data from your output
|
230 |
+
# data_example = {
|
231 |
+
# "uploaded_file_id": "ExampleUploadedFileID",
|
232 |
+
# "text": "Beyond Language Models: Byte Models are Digital World Simulators\nShangda Wu1 2* Xu Tan1* Zili Wang3Rui Wang1Xiaobing Li2Maosong Sun2 4\nhttps://byte-gpt.github.io\nAbstract\nTraditional deep learning often overlooks bytes,\nthe basic units of the digital world, where all\nforms of information and operations are encoded\nand manipulated in binary format. Inspired by\nthe success of next token prediction in natural lan-\nguage processing, we introduce bGPT, a model\nwith next byte prediction to simulate the digi-\ntal world. bGPT matches specialized models in\nperformance across various modalities, including\ntext, audio, and images, and offers new possibil-\nities for predicting, simulating, and diagnosing\nalgorithm or hardware behaviour. It has almost\nflawlessly replicated the process of converting\nsymbolic music data, achieving a low error rate of\n0.0011 bits per byte in converting ABC notation\nto MIDI format. In addition, bGPT demonstrates\nexceptional capabilities in simulating CPU be-\nhaviour, with an accuracy exceeding 99.99% in\nexecuting various operations. Leveraging next\nbyte prediction, models like bGPT can directly\nlearn from vast binary data, effectively simulating\nthe intricate patterns of the digital world.",
|
233 |
+
# "user_id": "ExampleUserID",
|
234 |
+
# "metadata": json.dumps({
|
235 |
+
# "page_label": "1",
|
236 |
+
# "file_name": "Beyond Language Models: Byte Models are Digital World Simulators.pdf",
|
237 |
+
# "file_path": "/home/fullstack/dev/licensed/apps/dspytest/notes/Beyond Language Models: Byte Models are Digital World Simulators.pdf",
|
238 |
+
# "file_type": "application/pdf",
|
239 |
+
# "file_size": 771513,
|
240 |
+
# "creation_date": "2024-03-05",
|
241 |
+
# "last_modified_date": "2024-03-03"
|
242 |
+
# })
|
243 |
+
# }
|
244 |
+
# """
|
245 |
+
# # uploaded_file_id: str
|
246 |
+
# text: str
|
247 |
+
# # user_id: str
|
248 |
+
# metadata: dict  # Since you're storing JSON, let's keep this as a dict
|
249 |
+
|
250 |
+
# # from llama_index.node_parser import SimpleNodeParser
|
251 |
+
# from llama_index.core.node_parser import SimpleNodeParser
|
252 |
+
|
253 |
+
|
254 |
+
# # Assuming DocumentData is already defined
|
255 |
+
# class DocumentData(BaseModel):
|
256 |
+
# text: str
|
257 |
+
# metadata: dict
|
258 |
+
|
259 |
+
# def clean_nul_chars(s):
|
260 |
+
# return s.replace('\x00', '')
|
261 |
+
|
262 |
+
|
263 |
+
# def load_data(directory="./books", text_splitter_cls=SentenceSplitter, chunk_size=1000, chunk_overlap=0):
|
264 |
+
# documents = SimpleDirectoryReader(input_dir=directory, exclude=[]).load_data()
|
265 |
+
|
266 |
+
# if len(documents) == 0:
|
267 |
+
# raise Exception("No documents found in the specified directory.")
|
268 |
+
|
269 |
+
# document_data_list = []
|
270 |
+
|
271 |
+
# node_parser = SimpleNodeParser(chunk_size=512, chunk_overlap=20, include_metadata=True)
|
272 |
+
|
273 |
+
# from llama_index.core import Document
|
274 |
+
# nodes = node_parser.get_nodes_from_documents(documents, show_progress=False )
|
275 |
+
|
276 |
+
# document_data_list = [
|
277 |
+
# DocumentData(
|
278 |
+
# text=node.text,
|
279 |
+
# metadata=json.loads(json.dumps(node.metadata)) # Assuming metadata is unchanged per chunk
|
280 |
+
# ) for node in nodes
|
281 |
+
# ]
|
282 |
+
# return document_data_list
|
283 |
+
|
284 |
+
# def save_document_data_list_to_file(document_data_list: List[DocumentData], file_path: str):
|
285 |
+
# with open(file_path, 'w', encoding='utf-8') as file:
|
286 |
+
# # Convert list of DocumentData instances to list of dictionaries and then to JSON
|
287 |
+
# json_data = json.dumps([doc.dict() for doc in document_data_list], indent=4)
|
288 |
+
# file.write(json_data)
|
289 |
+
|
290 |
+
# def load_document_data_list_from_file(file_path: str) -> List[DocumentData]:
|
291 |
+
# with open(file_path, 'r', encoding='utf-8') as file:
|
292 |
+
# # Load JSON data and then convert it to list of DocumentData instances
|
293 |
+
# json_data = json.load(file)
|
294 |
+
# document_data_list = parse_obj_as(List[DocumentData], json_data)
|
295 |
+
# return document_data_list
|
296 |
+
|
297 |
+
# file_path_to_save = './document_data_chunked_input.json'
|
298 |
+
# document_data_list = load_data()
|
299 |
+
# save_document_data_list_to_file(document_data_list, file_path_to_save)
|
300 |
+
# document_data_list = load_document_data_list_from_file(file_path_to_save)
|
301 |
+
|
302 |
+
|
303 |
+
# # Function to process a single document
|
304 |
+
# # Modified main loop to accept tqdm progress bar instance
|
305 |
+
# def main_loop(q, epoch, output_file, pbar):
|
306 |
+
# while True:
|
307 |
+
# document = q.get()
|
308 |
+
# if document is None:
|
309 |
+
# q.task_done()
|
310 |
+
# break # Sentinel value to end thread
|
311 |
+
|
312 |
+
# input_text = document.text
|
313 |
+
|
314 |
+
# kg = text2kg(text=input_text)
|
315 |
+
# # text2tripleObj = TypedPredictor(Text2Triple)
|
316 |
+
# # triples = text2tripleObj(text=kg.triples).object
|
317 |
+
# pp(kg)
|
318 |
+
|
319 |
+
# # Save the summary safely to an output file
|
320 |
+
# with open('books-kg.jsonl', "a") as f:
|
321 |
+
# # json.dump({"epoch": epoch, "summary": summary}, f)
|
322 |
+
# # add metadata, text, kg, and summary to the file
|
323 |
+
# json.dump({
|
324 |
+
# "metadata": document.metadata,
|
325 |
+
# "text": document.text,
|
326 |
+
# "kg": kg.triples
|
327 |
+
# }
|
328 |
+
# ,
|
329 |
+
# f)
|
330 |
+
# f.write("\n") # Ensure each summary is on a new line
|
331 |
+
|
332 |
+
# q.task_done()
|
333 |
+
# pbar.update(1) # Update progress bar
|
334 |
+
|
335 |
+
|
336 |
+
|
337 |
+
|
338 |
+
# q.task_done()
|
339 |
+
# pbar.update(1) # Update progress bar
|
340 |
+
|
341 |
+
# # Function to initiate processing with threading and tqdm
|
342 |
+
# def process_documents_with_threads(document_data_list, num_threads=5, output_file="summaries.jsonl"):
|
343 |
+
# q = queue.Queue()
|
344 |
+
# threads = []
|
345 |
+
|
346 |
+
# # Initialize tqdm progress bar
|
347 |
+
# pbar = tqdm(total=len(document_data_list))
|
348 |
+
|
349 |
+
# # Start threads
|
350 |
+
# for _ in range(num_threads):
|
351 |
+
# t = threading.Thread(target=main_loop, args=(q, epoch, output_file, pbar))
|
352 |
+
# t.start()
|
353 |
+
# threads.append(t)
|
354 |
+
|
355 |
+
# # Add documents to queue
|
356 |
+
# for document in document_data_list:
|
357 |
+
# q.put(document)
|
358 |
+
|
359 |
+
# # Add sentinel values to queue to signal threads to stop
|
360 |
+
# for _ in range(num_threads):
|
361 |
+
# q.put(None)
|
362 |
+
|
363 |
+
# # Wait for all threads to complete
|
364 |
+
# for t in threads:
|
365 |
+
# t.join()
|
366 |
+
|
367 |
+
# pbar.close() # Close progress bar
|
368 |
+
|
369 |
+
# DocumentDataArray = [DocumentData(text=document.text, metadata=document.metadata) for document in document_data_list]
|
370 |
+
# epoch = 0
|
371 |
+
# # iteration_seconds = 10 # Number of seconds between iterations
|
372 |
+
# NUM_THREADS = 18
|
373 |
+
# try:
|
374 |
+
# process_documents_with_threads(DocumentDataArray, num_threads=NUM_THREADS, output_file="summaries.jsonl")
|
375 |
+
# except KeyboardInterrupt:
|
376 |
+
# print("Processing stopped by user.")
|
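For readers following the pipeline in the file above: `Text2KGModule.forward` returns the KG with `kg.triples` flattened into a single comma-separated string of `("subject", "predicate", "object")` tuples. Below is a minimal sketch (a hypothetical helper, not part of this repo) of turning that string back into Python tuples for downstream use:

```python
# Hypothetical helper, not part of this repo: parse the triples string produced by
# Text2KGModule, e.g. '("a", "rel", "b"), ("c", "rel", "d")', back into tuples.
import ast

def parse_triples_string(triples_string: str) -> list[tuple[str, str, str]]:
    # Wrap the comma-separated tuples in brackets so the whole string is a list literal.
    parsed = ast.literal_eval(f"[{triples_string}]")
    # Keep only well-formed (subject, predicate, object) triples.
    return [t for t in parsed if isinstance(t, tuple) and len(t) == 3]

# Usage with the result of the call shown above (res = text2kg(text=...)):
# parse_triples_string(res.triples)
```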
triplets2/openai_usage.log
ADDED
File without changes
|
triplets2/optimized_k24-working-42pc.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
triplets2/triples-kg2summary.py
ADDED
@@ -0,0 +1,132 @@
|
1 |
+
|
2 |
+
from pprint import pp
|
3 |
+
from models import document
|
4 |
+
import pandas as pd
|
5 |
+
import dspy
|
6 |
+
from dspy.teleprompt import BootstrapFewShot
|
7 |
+
from dspy.evaluate import Evaluate
|
8 |
+
|
9 |
+
|
10 |
+
from opus.modules.kg2summary import KG2TextModule
|
11 |
+
# from utils.dataframe_utils import dataframe_to_documents
|
12 |
+
from opus.utils.dataframe_utils import dataframe_to_documents
|
13 |
+
|
14 |
+
from opus.metric.base import AssessPrediction
|
15 |
+
|
16 |
+
|
17 |
+
turbo = dspy.OpenAI(model='gpt-3.5-turbo', temperature=0.1, max_tokens=1200, top_p=0.89, api_base='http://localhost:6000/v1/', api_key='asdf', timeout=200)
|
18 |
+
# turbo = dspy.OpenAI(model='mixtral-turbo', temperature=0.5, max_tokens=1024, api_base='http://localhost:6000/v1/', api_key='asdf')#, timeout=200)
|
19 |
+
|
20 |
+
dspy.settings.configure(lm=turbo)
|
21 |
+
NUM_THREADS=10
|
22 |
+
|
23 |
+
from pprint import pprint
|
24 |
+
|
25 |
+
|
26 |
+
class AssessPrediction(dspy.Signature):
|
27 |
+
"""Evaluate and compare two summaries to identify which one more accurately and effectively conveys the key points of the original content.
|
28 |
+
|
29 |
+
As an expert reviewer, your role is to critically assess the clarity, coherence, and completeness of each summary. Consider the following aspects in your evaluation:
|
30 |
+
|
31 |
+
- Clarity: Does the summary present the information in a clear and understandable manner?
|
32 |
+
- Coherence: Is the summary logically organized and do the ideas flow smoothly?
|
33 |
+
- Completeness: Does the summary capture the essential points of the original content without omitting crucial information?
|
34 |
+
|
35 |
+
Based on your analysis, determine which summary does a better job at distilling the essence of the original material, making a sophisticated decision that takes into account not just the factual accuracy but also the readability and overall effectiveness of the summarization.
|
36 |
+
"""
|
37 |
+
summary1 = dspy.InputField()
|
38 |
+
summary2 = dspy.InputField()
|
39 |
+
|
40 |
+
assessment_answer = dspy.OutputField(desc="summary1 or summary2")
|
41 |
+
|
42 |
+
# class AssessPrediction(dspy.Signature):
|
43 |
+
# """Pick the better summary based on the example."""
|
44 |
+
# summary1 = dspy.InputField()
|
45 |
+
# summary2 = dspy.InputField()
|
46 |
+
|
47 |
+
# assessment_answer = dspy.OutputField(desc="summary1 or summary2")
|
48 |
+
|
49 |
+
import random
|
50 |
+
|
51 |
+
|
52 |
+
def factuality_metric(gold, pred, trace=None):
|
53 |
+
assess = dspy.ChainOfThought(AssessPrediction)
|
54 |
+
|
55 |
+
# if pred.summary_rewrite is "N/A" then default to pred.summary
|
56 |
+
_summary = pred.summary_rewrite if pred.summary_rewrite != "N/A" else pred.summary
|
57 |
+
|
58 |
+
# Initialize summaries with labels before shuffling
|
59 |
+
summaries = [('gold', gold.summary), ('pred', _summary)]
|
60 |
+
|
61 |
+
# Randomize summaries order and prepare for assessment
|
62 |
+
random.shuffle(summaries)
|
63 |
+
assessment_args = {
|
64 |
+
'summary1': summaries[0][1], # First summary after shuffle
|
65 |
+
'summary2': summaries[1][1] # Second summary after shuffle
|
66 |
+
}
|
67 |
+
|
68 |
+
# Keep track of which summary is which
|
69 |
+
summary1_label = summaries[0][0]
|
70 |
+
summary2_label = summaries[1][0]
|
71 |
+
|
72 |
+
# Assess using the randomized summaries
|
73 |
+
_winner = assess(**assessment_args)
|
74 |
+
winner_label = _winner.assessment_answer.split()[0].lower()
|
75 |
+
|
76 |
+
# Determine the winner based on original labels
|
77 |
+
if winner_label == 'summary1':
|
78 |
+
winner_is_gold = summary1_label == 'gold'
|
79 |
+
else:
|
80 |
+
winner_is_gold = summary2_label == 'gold'
|
81 |
+
|
82 |
+
return winner_is_gold
|
83 |
+
|
84 |
+
|
85 |
+
def train():
|
86 |
+
df = pd.read_parquet('./data/kg_datasetK17.parquet')
|
87 |
+
print(f"Number of records: {len(df)}")
|
88 |
+
random_sample = df.sample(n=5)
|
89 |
+
print(f"Random sample: {random_sample}")
|
90 |
+
|
91 |
+
# Setup our bootstrap
|
92 |
+
teleprompter = BootstrapFewShot(metric=factuality_metric, max_bootstrapped_demos=4, max_labeled_demos=16)
|
93 |
+
# Get the documents from the parquet file
|
94 |
+
_documents = dataframe_to_documents(df)
|
95 |
+
|
96 |
+
pp(_documents[0].text)
|
97 |
+
# exit()
|
98 |
+
|
99 |
+
documents = []
|
100 |
+
for doc in _documents:
|
101 |
+
# doc.with_inputs('kg')
|
102 |
+
doc = dspy.Example(**doc).with_inputs('kg')
|
103 |
+
documents.append(doc)
|
104 |
+
# random.shuffle(documents)
|
105 |
+
|
106 |
+
# from pprint import pprint
|
107 |
+
# pprint(documents)
|
108 |
+
|
109 |
+
# Split documents into train, validation, and test sets
|
110 |
+
split1, split2 = len(documents) // 3, 2 * len(documents) // 3
|
111 |
+
train, validation, test_set = documents[:split1], documents[split1:split2], documents[split2:]
|
112 |
+
|
113 |
+
train = train[:20]
|
114 |
+
validation = validation[:10]
|
115 |
+
test_set = test_set[:15]
|
116 |
+
optimized_KG2Text = teleprompter.compile(KG2TextModule(), trainset=train, valset=validation)
|
117 |
+
|
118 |
+
# print(f"optimized_KG2Text: {optimized_KG2Text}")
|
119 |
+
|
120 |
+
evaluate = Evaluate(devset=test_set, metric=factuality_metric, num_threads=NUM_THREADS, display_progress=True, display_table=0)
|
121 |
+
|
122 |
+
score, results = evaluate(optimized_KG2Text, return_all_scores=True, return_outputs=True)
|
123 |
+
|
124 |
+
# turbo.inspect_history(n=99)
|
125 |
+
|
126 |
+
print(f"Optimized KG2Text Scores: {score}")
|
127 |
+
print(f"Optimized KG2Text Results: {results[0]}")
|
128 |
+
optimized_KG2Text.save('optimized_KG2Text')
|
129 |
+
|
130 |
+
if __name__ == "__main__":
|
131 |
+
train()
|
132 |
+
|
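The `factuality_metric` above shuffles the gold and predicted summaries before showing them to the LM judge specifically to cancel out position bias. A minimal sketch of that bookkeeping, with a stubbed judge in place of `dspy.ChainOfThought(AssessPrediction)` so the logic can be exercised offline (names here are illustrative, not part of the repo):

```python
import random

def pick_winner(gold_summary: str, pred_summary: str, judge) -> bool:
    """Return True when the judge prefers the gold summary, whichever slot it lands in."""
    summaries = [("gold", gold_summary), ("pred", pred_summary)]
    random.shuffle(summaries)  # hide which side is gold from the judge
    answer = judge(summary1=summaries[0][1], summary2=summaries[1][1])  # "summary1" or "summary2"
    winner_label = summaries[0][0] if answer.startswith("summary1") else summaries[1][0]
    return winner_label == "gold"

# Stub judge that always prefers the longer summary, just to exercise the bookkeeping.
longer = lambda summary1, summary2: "summary1" if len(summary1) >= len(summary2) else "summary2"
print(pick_winner("a detailed gold summary", "short", longer))
```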
triplets2/triples-kg_syngen.jsonl
ADDED
The diff for this file is too large to render.
See raw diff
|
|
triplets2/triples-optimized_text2kg
ADDED
The diff for this file is too large to render.
See raw diff
|
|
triplets2/triples-train.py
ADDED
@@ -0,0 +1,55 @@
|
1 |
+
|
2 |
+
import pandas as pd
|
3 |
+
import dspy
|
4 |
+
from dspy.teleprompt import BootstrapFewShot
|
5 |
+
from dspy.evaluate import Evaluate
|
6 |
+
|
7 |
+
|
8 |
+
from opus.modules.text2kg import Text2KGModule
|
9 |
+
# from utils.dataframe_utils import dataframe_to_documents
|
10 |
+
from opus.utils.dataframe_utils import dataframe_to_documents
|
11 |
+
from opus.metric.base import factuality_metric
|
12 |
+
|
13 |
+
|
14 |
+
turbo = dspy.OpenAI(model='gpt-3.5-turbo', temperature=0.1, max_tokens=1200, top_p=0.89, api_base='http://localhost:6000/v1/', api_key='asdf', timeout=200)
|
15 |
+
# turbo = dspy.OpenAI(model='mixtral-turbo', temperature=0.5, max_tokens=1024, api_base='http://localhost:6000/v1/', api_key='asdf')#, timeout=200)
|
16 |
+
|
17 |
+
dspy.settings.configure(lm=turbo)
|
18 |
+
NUM_THREADS=10
|
19 |
+
|
20 |
+
def train():
|
21 |
+
df = pd.read_parquet('./data/kg_datasetK17.parquet')
|
22 |
+
print(f"Number of records: {len(df)}")
|
23 |
+
random_sample = df.sample(n=5)
|
24 |
+
print(f"Random sample: {random_sample}")
|
25 |
+
|
26 |
+
# Setup our bootstrap
|
27 |
+
teleprompter = BootstrapFewShot(metric=factuality_metric, max_bootstrapped_demos=4, max_labeled_demos=16)
|
28 |
+
# Get the documents from the parquet file
|
29 |
+
documents = dataframe_to_documents(df)
|
30 |
+
# random.shuffle(documents)
|
31 |
+
|
32 |
+
# Split documents into train, validation, and test sets
|
33 |
+
split1, split2 = len(documents) // 3, 2 * len(documents) // 3
|
34 |
+
train, validation, test_set = documents[:split1], documents[split1:split2], documents[split2:]
|
35 |
+
|
36 |
+
train = train[:10]
|
37 |
+
validation = validation[:10]
|
38 |
+
test_set = test_set[:15]
|
39 |
+
optimized_text2kg = teleprompter.compile(Text2KGModule(), trainset=train, valset=validation)
|
40 |
+
|
41 |
+
# print(f"optimized_text2kg: {optimized_text2kg}")
|
42 |
+
|
43 |
+
evaluate = Evaluate(devset=test_set, metric=factuality_metric, num_threads=NUM_THREADS, display_progress=True, display_table=0)
|
44 |
+
|
45 |
+
score, results = evaluate(optimized_text2kg, return_all_scores=True, return_outputs=True)
|
46 |
+
|
47 |
+
# turbo.inspect_history(n=99)
|
48 |
+
|
49 |
+
print(f"Optimized Text2KG Scores: {score}")
|
50 |
+
print(f"Optimized Text2KG Results: {results[0]}")
|
51 |
+
optimized_text2kg.save('optimized_text2kg')
|
52 |
+
|
53 |
+
if __name__ == "__main__":
|
54 |
+
train()
|
55 |
+
|
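Once `train()` has run, the compiled program is persisted via `optimized_text2kg.save('optimized_text2kg')`. A minimal sketch of reloading it for inference, assuming the same `Text2KGModule` import and `dspy` LM configuration as in `triples-train.py` above (endpoint and key values are illustrative):

```python
import dspy
from opus.modules.text2kg import Text2KGModule

# Same style of LM configuration as in triples-train.py (illustrative values).
turbo = dspy.OpenAI(model='gpt-3.5-turbo', temperature=0.1, max_tokens=1200,
                    api_base='http://localhost:6000/v1/', api_key='asdf', timeout=200)
dspy.settings.configure(lm=turbo)

text2kg = Text2KGModule()
text2kg.load('optimized_text2kg')  # restore the bootstrapped prompts/demos
prediction = text2kg(text="Ada Lovelace wrote the first published computer program.")
print(prediction.triples)
```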