David Elbel committed on
Commit
d50ef42
1 Parent(s): 6af8f75
triplets2/assertion.log DELETED
File without changes
triplets2/azure_openai_usage.log DELETED
@@ -1,113 +0,0 @@
- Retrying request to /chat/completions in 0.958111 seconds
- Retrying request to /chat/completions in 1.745523 seconds
- Retrying request to /chat/completions in 0.820110 seconds
- Retrying request to /chat/completions in 1.505952 seconds
- HTTP Request: POST https://api.freegpt.today/v1chat/completions "HTTP/1.1 404 Not Found"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 500 Internal Server Error"
- Retrying request to /completions in 0.947293 seconds
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 500 Internal Server Error"
- Retrying request to /completions in 1.887936 seconds
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 500 Internal Server Error"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 502 Bad Gateway"
- Retrying request to /completions in 0.814191 seconds
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 502 Bad Gateway"
- Retrying request to /completions in 1.747078 seconds
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 502 Bad Gateway"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 502 Bad Gateway"
- Retrying request to /completions in 0.909832 seconds
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 502 Bad Gateway"
- Retrying request to /completions in 1.592161 seconds
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 502 Bad Gateway"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 500 Internal Server Error"
- Retrying request to /completions in 0.837366 seconds
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 502 Bad Gateway"
- Retrying request to /completions in 1.775891 seconds
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 502 Bad Gateway"
- Retrying request to /completions in 0.964984 seconds
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 502 Bad Gateway"
- Retrying request to /completions in 1.653462 seconds
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 502 Bad Gateway"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/chat/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/chat/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/chat/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/chat/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/chat/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/chat/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/chat/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/chat/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/chat/completions "HTTP/1.1 200 OK"
- HTTP Request: POST https://api.freegpt.today/v1/chat/completions "HTTP/1.1 200 OK"
 
triplets2/books/SILENT WEAPONS for QUIET WARS.pdf DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:0565c7ac700ebd2974d3d7d17cb8ad5bcded4ad0527f0b3bfec492d5c3d8017c
- size 1979165
 
triplets2/dev/kg_syngen.py DELETED
@@ -1,135 +0,0 @@
-
- import json
- from pprint import pprint
- import queue
- import random
- import threading
- # import random
- # import pandas as pd
- import dspy
- from dspy.teleprompt import BootstrapFewShot
- from dspy.evaluate import Evaluate
- from tqdm import tqdm
-
-
- from opus.modules.text2kg import Text2KGModule
- from opus.modules.text2proposition import Text2PropositionModule, PropositionExamples
-
- # from utils.dataframe_utils import dataframe_to_documents
- # from opus.utils.dataframe_utils import dataframe_to_documents
- # from opus.metric.base import factuality_metric
-
-
- turbo = dspy.OpenAI(model='gpt-3.5-turbo', temperature=0.1, presence_penalty=0.2, max_tokens=1200, top_p=0.89, api_base='http://localhost:6000/v1/', api_key='asdf', timeout=200)
- # turbo = dspy.OpenAI(model='mixtral-turbo', temperature=0.5, max_tokens=1024, api_base='http://localhost:6000/v1/', api_key='asdf')#, timeout=200)
-
- dspy.settings.configure(lm=turbo)
- NUM_THREADS=10
-
- class AssessPrediction(dspy.Signature):
-     """Assess wether this is a high quality Proposition extract and that propositions are formated correctly"""
-
-     text = dspy.InputField()
-     proposition = dspy.InputField()
-
-     assessment_answer = dspy.OutputField(desc="Yes or No")
-
- program = dspy.ChainOfThought(AssessPrediction)
-
- def factuality_metric(gold, pred, trace=None):
-     # print(f"gold: {gold}")
-     # print(f"pred: {pred}")
-     # input("Press enter to continue")
-     # _program_output = dspy. (text=gold.passage, proposition=pred)
-     _program_output = dspy.predict
-
-     propostions = "(" + ", ".join(f"'{item}'" for item in pred) + ")"
-
-     program = dspy.ChainOfThought(AssessPrediction)
-     assessment = program(text=gold.passage, proposition=propostions)
-     score = assessment.assessment_answer.split()[0].lower() == 'yes'
-     return score
-
-
- from opus.utils.load_documents import load_data
-
- CoT = Text2PropositionModule()
- asessor = dspy.ChainOfThought(AssessPrediction)
-
-
- def process_document(documents):
-     _response_example_obj = CoT(passage=documents.text)
-     pprint(_response_example_obj)
-
-     response_example = "(" + ", ".join(f"'{item}'" for item in _response_example_obj) + ")"
-
-     _program_output = asessor(text=documents.text, proposition=response_example)
-     assessment = _program_output.assessment_answer.lower() == "yes"
-     pprint(f"assessment {assessment}")
-
-     return _response_example_obj, assessment
-
- # Function to process a single document
- # Modified main loop to accept tqdm progress bar instance
- def main_loop(q, output_file, pbar):
-     while True:
-         documents = q.get()
-         if documents is None:
-             q.task_done()
-             break # Sentinel value to end thread
-
-         # Call the function in the main loop
-         _response_example_obj, assessment = process_document(documents)
-
-         with open(output_file, "a") as f:
-             # json.dump({"epoch": epoch, "summary": summary}, f)
-             # add metadata, text, kg, and summary to the file
-             json.dump({
-                 "metadata": documents.metadata,
-                 "text": documents.text,
-                 "kg": _response_example_obj,
-                 "assessment": assessment,
-                 }
-                 ,
-                 f)
-             f.write("\n") # Ensure each summary is on a new line
-
-         q.task_done()
-         pbar.update(1) # Update progress bar
-
- # Function to initiate processing with threading and tqdm
- def process_documents_with_threads(document_data_list, num_threads=5, output_file="undefined-syngen.jsonl"):
-     q = queue.Queue()
-     threads = []
-
-     # Initialize tqdm progress bar
-     pbar = tqdm(total=len(document_data_list))
-
-     # Start threads
-     for _ in range(num_threads):
-         t = threading.Thread(target=main_loop, args=(q, output_file, pbar))
-         t.start()
-         threads.append(t)
-
-     # Add documents to queue
-     for document in document_data_list:
-         q.put(document)
-
-     # Add sentinel values to queue to signal threads to stop
-     for _ in range(num_threads):
-         q.put(None)
-
-     # Wait for all threads to complete
-     for t in threads:
-         t.join()
-
-     pbar.close() # Close progress bar
-
-
- NUM_THREADS = 20
- try:
-     DocumentDataArray = load_data('./data/books')
-     random.shuffle(DocumentDataArray)
-     process_documents_with_threads(DocumentDataArray, num_threads=NUM_THREADS, output_file="kg_syngen.jsonl")
- except KeyboardInterrupt:
-     print("Processing stopped by user.")
 
triplets2/dev/optimized_KG2Text DELETED
The diff for this file is too large to render. See raw diff
 
triplets2/dev/text2prop_syngen.py DELETED
@@ -1,205 +0,0 @@
1
-
2
- import json
3
- from pprint import pprint
4
- import queue
5
- import random
6
- import threading
7
- # import random
8
- # import pandas as pd
9
- import dspy
10
- from dspy.teleprompt import BootstrapFewShot
11
- from dspy.evaluate import Evaluate
12
- from tqdm import tqdm
13
-
14
-
15
- from opus.modules.text2kg import Text2KGModule
16
- from opus.modules.text2proposition import Text2PropositionModule, PropositionExamples
17
-
18
- # from utils.dataframe_utils import dataframe_to_documents
19
- # from opus.utils.dataframe_utils import dataframe_to_documents
20
- # from opus.metric.base import factuality_metric
21
-
22
-
23
- turbo = dspy.OpenAI(model='gpt-3.5-turbo', temperature=0.1, presence_penalty=0.2, max_tokens=1200, top_p=0.89, api_base='http://localhost:6000/v1/', api_key='asdf', timeout=200)
24
- # turbo = dspy.OpenAI(model='mixtral-turbo', temperature=0.5, max_tokens=1024, api_base='http://localhost:6000/v1/', api_key='asdf')#, timeout=200)
25
-
26
- dspy.settings.configure(lm=turbo)
27
- NUM_THREADS=10
28
-
29
- class AssessPrediction(dspy.Signature):
30
- """Assess wether this is a high quality Proposition extract and that propositions are formated correctly"""
31
-
32
- text = dspy.InputField()
33
- proposition = dspy.InputField()
34
-
35
- assessment_answer = dspy.OutputField(desc="Yes or No")
36
-
37
- program = dspy.ChainOfThought(AssessPrediction)
38
-
39
- def factuality_metric(gold, pred, trace=None):
40
- # print(f"gold: {gold}")
41
- # print(f"pred: {pred}")
42
- # input("Press enter to continue")
43
- # _program_output = dspy. (text=gold.passage, proposition=pred)
44
- _program_output = dspy.predict
45
-
46
- propostions = "(" + ", ".join(f"'{item}'" for item in pred) + ")"
47
-
48
- program = dspy.ChainOfThought(AssessPrediction)
49
- assessment = program(text=gold.passage, proposition=propostions)
50
- score = assessment.assessment_answer.split()[0].lower() == 'yes'
51
- return score
52
-
53
-
54
- from opus.utils.load_documents import load_data
55
- # save_document_data_list_to_file, load_document_data_list_from_file
56
- # def train():
57
- # # documents = load_data('./data/books')
58
- # # save_document_data_list_to_file(documents, './data/text2proposition-books.json')
59
- # # documents = load_document_data_list_from_file('./data/text2proposition-books.json')
60
-
61
- # # # Get the documents from the parquet file
62
- # # documents = dataframe_to_documents(df)
63
- # # random.shuffle(documents)
64
-
65
- # # # # Split documents into train, validation, and test sets
66
- # # split1, split2 = len(documents) // 3, 2 * len(documents) // 3
67
- # # train, validation, test_set = documents[:split1], documents[split1:split2], documents[split2:]
68
-
69
-
70
- # # for each string in array _prop, merge all the arrays together and separate t hem by a comman join them thanks
71
- # _PropositionExamples = [dspy.Example(proposition='['+','.join(prop['propositions'])+']', passage=prop['passage']) for prop in PropositionExamples]
72
- # _PropositionExamples = [dspy.Example(**prop).with_inputs('passage') for prop in PropositionExamples]
73
-
74
- # # train = _PropositionExamples[:10]
75
- # # validation = _PropositionExamples[:10]
76
- # # test_set = test_set[:15]
77
- # # train = [
78
- # # # dspy.Example(proposition=','.join(prop['proposition']), passage
79
- # # dspy.Example(proposition="fuck you", passage="hello world").with_inputs("passage")
80
- # # ]
81
- # train = _PropositionExamples[:5]
82
- # validation = _PropositionExamples[5:]
83
- # pprint(train)
84
-
85
- # print("val")
86
- # pprint(validation)
87
-
88
-
89
- # teleprompter = BootstrapFewShot(metric=factuality_metric, max_bootstrapped_demos=3, max_labeled_demos=16)
90
- # optimized_prop = teleprompter.compile(Text2PropositionModule(), trainset=train, valset=validation)
91
-
92
- # # # # print(f"optimized_text2kg: {optimized_text2kg}")
93
-
94
-
95
- # yes_responses = []
96
-
97
-
98
-
99
-
100
-
101
-
102
- # # If you want to see the documents and propositions stored for "YES" responses
103
- # for doc, prop in yes_responses:
104
- # print("\nDocument:", doc.text)
105
- # print("Proposition:", prop)
106
-
107
-
108
- # if __name__ == "__main__":
109
- # train()
110
-
111
- CoT = Text2PropositionModule()
112
- asessor = dspy.ChainOfThought(AssessPrediction)
113
-
114
- # Function to process a single document
115
- # Modified main loop to accept tqdm progress bar instance
116
- def main_loop(q, output_file, pbar):
117
- while True:
118
- documents = q.get()
119
- if documents is None:
120
- q.task_done()
121
- break # Sentinel value to end thread
122
-
123
-
124
- _response_example_obj = CoT(passage=documents.text)
125
- # print(f"Document: {document.text}\nProposition:")
126
- pprint(_response_example_obj)
127
-
128
- # response_example = ', '.join(response_example)
129
- response_example = "(" + ", ".join(f"'{item}'" for item in _response_example_obj) + ")"
130
- # turn response [] array into a , commas separated
131
-
132
- _program_output = asessor(text=documents.text, proposition=response_example)
133
- assessment = _program_output.assessment_answer.lower() == "yes"
134
- pprint(f"assessment {assessment}")
135
-
136
-
137
- # Save the summary safely to an output file
138
- with open(output_file, "a") as f:
139
- # json.dump({"epoch": epoch, "summary": summary}, f)
140
- # add metadata, text, kg, and summary to the file
141
- json.dump({
142
- "metadata": documents.metadata,
143
- "text": documents.text,
144
- "proposition": _response_example_obj,
145
- # 'page_label':
146
- # '2'
147
- # 'file_name':
148
- # 'Property1_Turner_Dec2014.pdf'
149
- # 'file_path':
150
- # '/home/fullstack/dev/licensed/apps/opus/data/books/Property1_Turner_Dec2014.pdf'
151
- # 'file_type':
152
- # 'application/pdf'
153
- # 'file_size':
154
- # 3652131
155
- # 'creation_date':
156
- # '2024-03-12'
157
- # 'last_modified_date':
158
- # '2024-03-12'
159
- # len():
160
- # 7
161
- #
162
- }
163
- ,
164
- f)
165
- f.write("\n") # Ensure each summary is on a new line
166
-
167
- q.task_done()
168
- pbar.update(1) # Update progress bar
169
-
170
- # Function to initiate processing with threading and tqdm
171
- def process_documents_with_threads(document_data_list, num_threads=5, output_file="./output/propositions.jsonl"):
172
- q = queue.Queue()
173
- threads = []
174
-
175
- # Initialize tqdm progress bar
176
- pbar = tqdm(total=len(document_data_list))
177
-
178
- # Start threads
179
- for _ in range(num_threads):
180
- t = threading.Thread(target=main_loop, args=(q, output_file, pbar))
181
- t.start()
182
- threads.append(t)
183
-
184
- # Add documents to queue
185
- for document in document_data_list:
186
- q.put(document)
187
-
188
- # Add sentinel values to queue to signal threads to stop
189
- for _ in range(num_threads):
190
- q.put(None)
191
-
192
- # Wait for all threads to complete
193
- for t in threads:
194
- t.join()
195
-
196
- pbar.close() # Close progress bar
197
-
198
-
199
- NUM_THREADS = 20
200
- try:
201
- DocumentDataArray = load_data('./data/books')
202
- random.shuffle(DocumentDataArray)
203
- process_documents_with_threads(DocumentDataArray, num_threads=NUM_THREADS, output_file="propostions_syngen.jsonl")
204
- except KeyboardInterrupt:
205
- print("Processing stopped by user.")
 
triplets2/document_data_chunked_input.json DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:0d32b9544f2fcffb03b012ccaaa98eba606b534aa011d1c14385f3fc71859699
- size 36256148
 
triplets2/k25-infer.py DELETED
@@ -1,353 +0,0 @@
1
- from doctest import Example
2
- from operator import is_
3
- from pprint import pp
4
-
5
- from typing import List
6
- import random
7
- from marshmallow import missing
8
- import pandas as pd
9
- from pydantic import BaseModel, Field
10
- import dspy
11
- import dspy
12
- from dspy.teleprompt import BootstrapFewShot
13
- from dspy.evaluate import Evaluate, metrics
14
- from dspy.functional import TypedPredictor
15
-
16
-
17
-
18
- class KGTriple(BaseModel):
19
- # The subject of the triple, representing the entity or concept the triple is about.
20
- subject: str = Field(..., description="The subject of the Knowledge Graph triple.")
21
-
22
- # The predicate of the triple, representing the relationship or attribute of the subject.
23
- predicate: str = Field(..., description="The predicate of the Knowledge Graph triple, defining the type of relationship between the subject and object.")
24
-
25
- # The object of the triple, representing the entity or concept that is related to the subject by the predicate.
26
- object: str = Field(..., description="The object of the Knowledge Graph triple, representing the entity or value that the subject is related to via the predicate.")
27
-
28
-
29
- class KG(BaseModel):
30
- triples: List[KGTriple]
31
-
32
-
33
- class Document(BaseModel):
34
- text: str
35
- page_label: str
36
- file_name: str
37
- file_type: str
38
- file_size: int
39
- creation_date: str
40
- kg: str
41
- summary: str
42
- # answer: str
43
- # input_text: Optio
44
-
45
-
46
- class Text2KG(dspy.Signature):
47
- """
48
- The task involves analyzing an input text to extract a Knowledge Graph (KG) represented by triples in the format ("subject", "predicate", "object"). Each triple should convey a specific fact or relationship derived from the text.
49
-
50
- Your goal is to distill complex sentences into fundamental components, focusing on extracting succinct and atomic pieces of information. Instead of using entire sentences as triples, break down the information into the most granular facts possible. This process might involve decomposing a single sentence into multiple triples, each highlighting a distinct piece of information or relationship.
51
-
52
- Be meticulous in identifying subjects, predicates, and objects, ensuring that each triple accurately reflects the relationships within the text. This extraction task does not require verifying the factual accuracy of the content; your primary objective is to transform the text into a structured format that accurately captures the key points and relationships presented.
53
-
54
- ### Guidelines:
55
- - Identify key entities and actions within the text as subjects and predicates.
56
- - Distill complex information into multiple, atomic triples when necessary.
57
- - Ensure that triples are independent and self-contained, conveying clear relationships.
58
- - Avoid redundancy and ensure that each triple contributes unique information to the KG.
59
- - triples format is a long string of ('subject', 'predicate', 'object'), ('subject', 'predicate', 'object'), ...
60
- """
61
-
62
- input_text = dspy.InputField()
63
- triples = dspy.OutputField(desc="('subject', 'predicate', 'object'), ...")
64
-
65
- class KG2Text(dspy.Signature):
66
- """
67
- Given a set of Knowledge Graph (KG) triples, your task is to organize and present the information from these triples in a detailed, factual, and evidence-based document. The document should closely adhere to the provided triples, logically arranging the information to ensure clarity and factual integrity.
68
-
69
- Avoid creating a narrative or adding interpretive elements. Instead, focus on structuring the KG's factual content into a coherent, detailed document that directly reflects the relationships and entities within the KG.
70
-
71
- ### Guidelines:
72
- - Present the KG triples in a logically structured manner, maintaining the factual content.
73
- - Do not infer or add information not explicitly contained within the KG triples.
74
- - Arrange the triples to enhance understanding, grouping related facts where applicable.
75
- - Use clear and concise language to ensure the factual basis of the KG is communicated effectively.
76
- """
77
-
78
- input_kg = dspy.InputField(desc="The KG triples to summarize")
79
- summary = dspy.OutputField(desc="The synthesized narrative or description")
80
-
81
- class SummaryCritic(dspy.Signature):
82
- """Point out what the Summary is missing from the text
83
-
84
- """
85
-
86
- input_text = dspy.InputField(desc="The text to generate KG from")
87
- kg = dspy.InputField(desc="The generated KG in triples format surrounded by quotes")
88
- summary = dspy.InputField(desc="The synthesized narrative or description")
89
-
90
- summary_critique = dspy.OutputField(desc="The critique of the summary and details of any missing information")
91
- summary_rewrite = dspy.OutputField(desc="The rewritten summary with the missing information added")
92
- missing_kg = dspy.OutputField(desc='triples in the format ("subject", "predicate", "object")')
93
-
94
-
95
- # kg_string = ", ".join(f'("{triple.subject}", "{triple.predicate}", "{triple.object}")' for triple in doc.kg.triples)
96
-
97
-
98
- class Text2Triple(dspy.Signature):
99
- """Convert"""
100
- text: str = dspy.InputField(desc="Text to extract triple pairs") #desc="The first KG in the comparison")
101
- # missing_kg: str = dspy.InputField(desc="The second KG")
102
- object: List[KGTriple] = dspy.OutputField(desc="triples in the format ('subject', 'predicate', 'object')")
103
-
104
- # cot_predictor = TypedPredictor(CodeSignature)
105
-
106
- # _kg: KGTripleList = cot_predictor(string=example.kg).kg
107
- class Text2KGModule(dspy.Module):
108
- def __init__(self):
109
- super().__init__()
110
- self.text2kg = dspy.Predict(Text2KG)
111
- self.kg2text = dspy.ChainOfThought(KG2Text)
112
- self.critic = dspy.Predict(SummaryCritic)
113
- self.code_signature = TypedPredictor(Text2Triple)
114
-
115
- def forward(self, text):
116
- kg = self.text2kg(input_text=text)
117
- # text2kg = dspy.Predict(Text2KG)
118
-
119
- kg2text = dspy.ChainOfThought(KG2Text)
120
- critic = dspy.Predict(SummaryCritic)
121
- text2tripleObj = TypedPredictor(Text2Triple)
122
-
123
- summary = kg2text(input_kg=kg.triples).summary
124
- critique = critic(input_text=text, kg=kg.triples, summary=summary)
125
-
126
- missing_kg = critique.missing_kg
127
- # # rewritten_summary = critique.summary_rewrite
128
-
129
- try:
130
- triples = text2tripleObj(text=kg.triples).object
131
- missing_triples = text2tripleObj(text=missing_kg).object
132
-
133
-
134
- triples_string = ", ".join([f'("{triple.subject}", "{triple.predicate}", "{triple.object}")' for triple in triples])
135
-
136
- kg.triples = triples_string
137
-
138
- missing_triples_string = ", ".join([f'("{triple.subject}", "{triple.predicate}", "{triple.object}")' for triple in missing_triples])
139
- kg.triples = triples_string + ", " + missing_triples_string
140
- except Exception as e:
141
- return kg
142
-
143
- return kg
144
-
145
- # return self.prog(input_text=text)
146
-
147
-
148
- turbo = dspy.OpenAI(model='gpt-3.5-turbo', temperature=0.1, max_tokens=1200, top_p=0.89, api_base='https://freegpt.today/v1', api_key='asdf', timeout=200)
149
-
150
- dspy.settings.configure(lm=turbo)
151
-
152
-
153
- text2kg = Text2KGModule()
154
-
155
- file_name='./optimized_k24-working-42pc.json'
156
- text2kg.load(file_name)
157
-
158
-
159
- res = text2kg(text="The quick brown fox jumps over the lazy dog")
160
- pp(res)
161
-
162
-
163
-
164
-
165
-
166
-
167
-
168
-
169
-
170
-
171
-
172
-
173
-
174
-
175
-
176
-
177
-
178
-
179
-
180
-
181
-
182
-
183
-
184
-
185
-
186
-
187
- # import queue
188
- # import threading
189
- # import json
190
- # import time
191
- # from llama_index_client import SentenceSplitter
192
- # from pydantic import BaseModel
193
- # import json
194
- # from typing import List
195
- # import dspy
196
- # from pydantic import BaseModel, parse_obj_as
197
- # from llama_index.core import SimpleDirectoryReader
198
-
199
- # from dspy import dsp
200
-
201
- # from tqdm import tqdm
202
-
203
-
204
-
205
- # class DocumentData(BaseModel):
206
- # """# Example data from your output
207
- # data_example = {
208
- # "uploaded_file_id": "ExampleUploadedFileID",
209
- # "text": "Beyond Language Models: Byte Models are Digital World Simulators\nShangda Wu1 2* Xu Tan1* Zili Wang3Rui Wang1Xiaobing Li2Maosong Sun2 4\nhttps://byte-gpt.github.io\nAbstract\nTraditional deep learning often overlooks bytes,\nthe basic units of the digital world, where all\nforms of information and operations are encoded\nand manipulated in binary format. Inspired by\nthe success of next token prediction in natural lan-\nguage processing, we introduce bGPT, a model\nwith next byte prediction to simulate the digi-\ntal world. bGPT matches specialized models in\nperformance across various modalities, including\ntext, audio, and images, and offers new possibil-\nities for predicting, simulating, and diagnosing\nalgorithm or hardware behaviour. It has almost\nflawlessly replicated the process of converting\nsymbolic music data, achieving a low error rate of\n0.0011 bits per byte in converting ABC notation\nto MIDI format. In addition, bGPT demonstrates\nexceptional capabilities in simulating CPU be-\nhaviour, with an accuracy exceeding 99.99% in\nexecuting various operations. Leveraging next\nbyte prediction, models like bGPT can directly\nlearn from vast binary data, effectively simulating\nthe intricate patterns of the digital world.",
210
- # "user_id": "ExampleUserID",
211
- # "metadata": json.dumps({
212
- # "page_label": "1",
213
- # "file_name": "Beyond Language Models: Byte Models are Digital World Simulators.pdf",
214
- # "file_path": "/home/fullstack/dev/licensed/apps/dspytest/notes/Beyond Language Models: Byte Models are Digital World Simulators.pdf",
215
- # "file_type": "application/pdf",
216
- # "file_size": 771513,
217
- # "creation_date": "2024-03-05",
218
- # "last_modified_date": "2024-03-03"
219
- # })
220
- # }
221
- # """
222
- # # uploaded_file_id: str
223
- # text: str
224
- # # user_id: str
225
- # metadata: dict # Since you're storing JSON, let's keep this as a dictMetadata
226
-
227
- # # from llama_index.node_parser import SimpleNodeParser
228
- # from llama_index.core.node_parser import SimpleNodeParser
229
-
230
-
231
- # # Assuming DocumentData is already defined
232
- # class DocumentData(BaseModel):
233
- # text: str
234
- # metadata: dict
235
-
236
- # def clean_nul_chars(s):
237
- # return s.replace('\x00', '')
238
-
239
-
240
- # def load_data(directory="./books", text_splitter_cls=SentenceSplitter, chunk_size=1000, chunk_overlap=0):
241
- # documents = SimpleDirectoryReader(input_dir=directory, exclude=[]).load_data()
242
-
243
- # if len(documents) == 0:
244
- # raise Exception("No documents found in the specified directory.")
245
-
246
- # document_data_list = []
247
-
248
- # node_parser = SimpleNodeParser(chunk_size=512, chunk_overlap=20, include_metadata=True)
249
-
250
- # from llama_index.core import Document
251
- # nodes = node_parser.get_nodes_from_documents(documents, show_progress=False )
252
-
253
- # document_data_list = [
254
- # DocumentData(
255
- # text=node.text,
256
- # metadata=json.loads(json.dumps(node.metadata)) # Assuming metadata is unchanged per chunk
257
- # ) for node in nodes
258
- # ]
259
- # return document_data_list
260
-
261
- # def save_document_data_list_to_file(document_data_list: List[DocumentData], file_path: str):
262
- # with open(file_path, 'w', encoding='utf-8') as file:
263
- # # Convert list of DocumentData instances to list of dictionaries and then to JSON
264
- # json_data = json.dumps([doc.dict() for doc in document_data_list], indent=4)
265
- # file.write(json_data)
266
-
267
- # def load_document_data_list_from_file(file_path: str) -> List[DocumentData]:
268
- # with open(file_path, 'r', encoding='utf-8') as file:
269
- # # Load JSON data and then convert it to list of DocumentData instances
270
- # json_data = json.load(file)
271
- # document_data_list = parse_obj_as(List[DocumentData], json_data)
272
- # return document_data_list
273
-
274
- # file_path_to_save = './document_data_chunked_input.json'
275
- # document_data_list = load_data()
276
- # save_document_data_list_to_file(document_data_list, file_path_to_save)
277
- # document_data_list = load_document_data_list_from_file(file_path_to_save)
278
-
279
-
280
- # # Function to process a single document
281
- # # Modified main loop to accept tqdm progress bar instance
282
- # def main_loop(q, epoch, output_file, pbar):
283
- # while True:
284
- # document = q.get()
285
- # if document is None:
286
- # q.task_done()
287
- # break # Sentinel value to end thread
288
-
289
- # input_text = document.text
290
-
291
- # kg = text2kg(text=input_text)
292
- # # text2tripleObj = TypedPredictor(Text2Triple)
293
- # # triples = text2tripleObj(text=kg.triples).object
294
- # pp(kg)
295
-
296
- # # Save the summary safely to an output file
297
- # with open('books-kg.jsonl', "a") as f:
298
- # # json.dump({"epoch": epoch, "summary": summary}, f)
299
- # # add metadata, text, kg, and summary to the file
300
- # json.dump({
301
- # "metadata": document.metadata,
302
- # "text": document.text,
303
- # "kg": kg.triples
304
- # }
305
- # ,
306
- # f)
307
- # f.write("\n") # Ensure each summary is on a new line
308
-
309
- # q.task_done()
310
- # pbar.update(1) # Update progress bar
311
-
312
-
313
-
314
-
315
- # q.task_done()
316
- # pbar.update(1) # Update progress bar
317
-
318
- # # Function to initiate processing with threading and tqdm
319
- # def process_documents_with_threads(document_data_list, num_threads=5, output_file="summaries.jsonl"):
320
- # q = queue.Queue()
321
- # threads = []
322
-
323
- # # Initialize tqdm progress bar
324
- # pbar = tqdm(total=len(document_data_list))
325
-
326
- # # Start threads
327
- # for _ in range(num_threads):
328
- # t = threading.Thread(target=main_loop, args=(q, epoch, output_file, pbar))
329
- # t.start()
330
- # threads.append(t)
331
-
332
- # # Add documents to queue
333
- # for document in document_data_list:
334
- # q.put(document)
335
-
336
- # # Add sentinel values to queue to signal threads to stop
337
- # for _ in range(num_threads):
338
- # q.put(None)
339
-
340
- # # Wait for all threads to complete
341
- # for t in threads:
342
- # t.join()
343
-
344
- # pbar.close() # Close progress bar
345
-
346
- # DocumentDataArray = [DocumentData(text=document.text, metadata=document.metadata) for document in document_data_list]
347
- # epoch = 0
348
- # # iteration_seconds = 10 # Number of seconds between iterations
349
- # NUM_THREADS = 18
350
- # try:
351
- # process_documents_with_threads(DocumentDataArray, num_threads=NUM_THREADS, output_file="summaries.jsonl")
352
- # except KeyboardInterrupt:
353
- # print("Processing stopped by user.")
 
triplets2/k25.py DELETED
@@ -1,376 +0,0 @@
1
- from doctest import Example
2
- from operator import is_
3
- from pprint import pp
4
-
5
- from typing import List
6
- import random
7
- from marshmallow import missing
8
- import pandas as pd
9
- from pydantic import BaseModel, Field
10
- import dspy
11
- import dspy
12
- from dspy.teleprompt import BootstrapFewShot
13
- from dspy.evaluate import Evaluate, metrics
14
- from dspy.functional import TypedPredictor
15
-
16
-
17
-
18
- class KGTriple(BaseModel):
19
- # The subject of the triple, representing the entity or concept the triple is about.
20
- subject: str = Field(..., description="The subject of the Knowledge Graph triple.")
21
-
22
- # The predicate of the triple, representing the relationship or attribute of the subject.
23
- predicate: str = Field(..., description="The predicate of the Knowledge Graph triple, defining the type of relationship between the subject and object.")
24
-
25
- # The object of the triple, representing the entity or concept that is related to the subject by the predicate.
26
- object: str = Field(..., description="The object of the Knowledge Graph triple, representing the entity or value that the subject is related to via the predicate.")
27
-
28
-
29
- class KG(BaseModel):
30
- triples: List[KGTriple]
31
-
32
-
33
- class Document(BaseModel):
34
- text: str
35
- page_label: str
36
- file_name: str
37
- file_type: str
38
- file_size: int
39
- creation_date: str
40
- kg: str
41
- summary: str
42
- # answer: str
43
- # input_text: Optio
44
-
45
-
46
- class Text2KG(dspy.Signature):
47
- """
48
- The task involves analyzing an input text to extract a Knowledge Graph (KG) represented by triples in the format ("subject", "predicate", "object"). Each triple should convey a specific fact or relationship derived from the text.
49
-
50
- Your goal is to distill complex sentences into fundamental components, focusing on extracting succinct and atomic pieces of information. Instead of using entire sentences as triples, break down the information into the most granular facts possible. This process might involve decomposing a single sentence into multiple triples, each highlighting a distinct piece of information or relationship.
51
-
52
- Be meticulous in identifying subjects, predicates, and objects, ensuring that each triple accurately reflects the relationships within the text. This extraction task does not require verifying the factual accuracy of the content; your primary objective is to transform the text into a structured format that accurately captures the key points and relationships presented.
53
-
54
- ### Guidelines:
55
- - Identify key entities and actions within the text as subjects and predicates.
56
- - Distill complex information into multiple, atomic triples when necessary.
57
- - Ensure that triples are independent and self-contained, conveying clear relationships.
58
- - Avoid redundancy and ensure that each triple contributes unique information to the KG.
59
- - triples format is a long string of ('subject', 'predicate', 'object'), ('subject', 'predicate', 'object'), ...
60
-
61
- Rules: Predicates should be one word
62
- """
63
-
64
- input_text = dspy.InputField(format=str)
65
- triples = dspy.OutputField(desc="('subject', 'predicate', 'object'), ...")
66
-
67
- class KG2Text(dspy.Signature):
68
- """
69
- Given a set of Knowledge Graph (KG) triples, your task is to organize and present the information from these triples in a detailed, factual, and evidence-based document. The document should closely adhere to the provided triples, logically arranging the information to ensure clarity and factual integrity.
70
-
71
- Avoid creating a narrative or adding interpretive elements. Instead, focus on structuring the KG's factual content into a coherent, detailed document that directly reflects the relationships and entities within the KG.
72
-
73
- ### Guidelines:
74
- - Present the KG triples in a logically structured manner, maintaining the factual content.
75
- - Do not infer or add information not explicitly contained within the KG triples.
76
- - Arrange the triples to enhance understanding, grouping related facts where applicable.
77
- - Use clear and concise language to ensure the factual basis of the KG is communicated effectively.
78
- """
79
-
80
- input_kg = dspy.InputField(desc="The KG triples to summarize")
81
- summary = dspy.OutputField(desc="The synthesized narrative or description")
82
-
83
- class SummaryCritic(dspy.Signature):
84
- """Point out what the Summary is missing from the text
85
-
86
- """
87
-
88
- input_text = dspy.InputField(desc="The text to generate KG from")
89
- kg = dspy.InputField(desc="The generated KG in triples format surrounded by quotes")
90
- summary = dspy.InputField(desc="The synthesized narrative or description")
91
-
92
- summary_critique = dspy.OutputField(desc="The critique of the summary and details of any missing information")
93
- summary_rewrite = dspy.OutputField(desc="The rewritten summary with the missing information added")
94
- missing_kg = dspy.OutputField(desc='triples in the format ("subject", "predicate", "object")')
95
-
96
-
97
- # kg_string = ", ".join(f'("{triple.subject}", "{triple.predicate}", "{triple.object}")' for triple in doc.kg.triples)
98
-
99
-
100
- class Text2Triple(dspy.Signature):
101
- """Convert triples in format (subject, predicate, object) to JSON"""
102
- text: str = dspy.InputField(desc="Text to extract triple pairs") #desc="The first KG in the comparison")
103
- # missing_kg: str = dspy.InputField(desc="The second KG")
104
- object: List[KGTriple] = dspy.OutputField(desc="")
105
-
106
- # cot_predictor = TypedPredictor(CodeSignature)
107
-
108
- # _kg: KGTripleList = cot_predictor(string=example.kg).kg
109
- class Text2KGModule(dspy.Module):
110
- def __init__(self):
111
- super().__init__()
112
- self.text2kg = dspy.Predict(Text2KG)
113
- self.kg2text = dspy.ChainOfThought(KG2Text)
114
- self.critic = dspy.Predict(SummaryCritic)
115
- self.code_signature = TypedPredictor(Text2Triple)
116
-
117
- def forward(self, text):
118
- kg = self.text2kg(input_text=text)
119
- # text2kg = dspy.Predict(Text2KG)
120
-
121
- kg2text = dspy.ChainOfThought(KG2Text)
122
- critic = dspy.Predict(SummaryCritic)
123
- text2tripleObj = TypedPredictor(Text2Triple)
124
-
125
- summary = kg2text(input_kg=kg.triples).summary
126
- critique = critic(input_text=text, kg=kg.triples, summary=summary)
127
-
128
- missing_kg = critique.missing_kg
129
- # # rewritten_summary = critique.summary_rewrite
130
-
131
- try:
132
- triples = text2tripleObj(text=kg.triples).object
133
- missing_triples = text2tripleObj(text=missing_kg).object
134
-
135
-
136
- triples_string = ", ".join([f'("{triple.subject}", "{triple.predicate}", "{triple.object}")' for triple in triples])
137
-
138
- kg.triples = triples_string
139
-
140
- missing_triples_string = ", ".join([f'("{triple.subject}", "{triple.predicate}", "{triple.object}")' for triple in missing_triples])
141
- kg.triples = triples_string + ", " + missing_triples_string
142
- except Exception as e:
143
- print(f"Not working\nError: {e}")
144
- return kg
145
-
146
- return kg
147
-
148
- # return self.prog(input_text=text)
149
-
150
-
151
- API_BASE = 'https://api.freegpt.today/v1/'
152
- # MODEL_NAME = requests.get(API_BASE+'models').json()['data'][0]['id']
153
- # # MODEL_NAME ='mixtral-turbo'
154
- MODEL_NAME ='gpt-3.5-turbo'
155
- turbo = dspy.OpenAI(model_type='text', top_p=1, frequency_penalty=0.2, model=MODEL_NAME, temperature=0.1, max_tokens=6000, api_base=API_BASE, api_key='asdf', timeout=200)
156
- dspy.settings.configure(lm=turbo)
157
-
158
-
159
- text2kg = Text2KGModule()
160
-
161
- file_name='./optimized_k24-working-42pc.json'
162
- # text2kg.load(file_name)
163
-
164
-
165
- llama3_chat = dspy.OpenAI(
166
- model_type='chat',
167
- # model_type='chat',
168
- # top_p=1, presence_penalty=0.1, temperature=0.1,
169
- max_tokens=3000,
170
- model='llama3_70b_chat', #'wizardlm', #llama3_70b_chat', #haiku_turbo_chat',
171
- api_base='https://api.freegpt.today/v1/',
172
- api_key='asdf',
173
- timeout=20000)
174
-
175
-
176
- with dspy.context(lm=llama3_chat):
177
- res = text2kg(text="""
178
- The conversation involves two messages:
179
-
180
- You reached out to Vijay, greeting him and expressing your interest in AI site development and looking for opportunities, indicating your technical background and asking how you can assist him.
181
- Vijay responded to you, expressing enthusiasm about your experience and mentioning he is seeking a co-founder. He invited you to connect further to discuss this possibility.
182
- """)
183
- pp(res)
184
-
185
-
186
-
187
-
188
-
189
-
190
-
191
-
192
-
193
-
194
-
195
-
196
-
197
-
198
-
199
-
200
-
201
-
202
-
203
-
204
-
205
-
206
-
207
-
208
-
209
-
210
- # import queue
211
- # import threading
212
- # import json
213
- # import time
214
- # from llama_index_client import SentenceSplitter
215
- # from pydantic import BaseModel
216
- # import json
217
- # from typing import List
218
- # import dspy
219
- # from pydantic import BaseModel, parse_obj_as
220
- # from llama_index.core import SimpleDirectoryReader
221
-
222
- # from dspy import dsp
223
-
224
- # from tqdm import tqdm
225
-
226
-
227
-
228
- # class DocumentData(BaseModel):
229
- # """# Example data from your output
230
- # data_example = {
231
- # "uploaded_file_id": "ExampleUploadedFileID",
232
- # "text": "Beyond Language Models: Byte Models are Digital World Simulators\nShangda Wu1 2* Xu Tan1* Zili Wang3Rui Wang1Xiaobing Li2Maosong Sun2 4\nhttps://byte-gpt.github.io\nAbstract\nTraditional deep learning often overlooks bytes,\nthe basic units of the digital world, where all\nforms of information and operations are encoded\nand manipulated in binary format. Inspired by\nthe success of next token prediction in natural lan-\nguage processing, we introduce bGPT, a model\nwith next byte prediction to simulate the digi-\ntal world. bGPT matches specialized models in\nperformance across various modalities, including\ntext, audio, and images, and offers new possibil-\nities for predicting, simulating, and diagnosing\nalgorithm or hardware behaviour. It has almost\nflawlessly replicated the process of converting\nsymbolic music data, achieving a low error rate of\n0.0011 bits per byte in converting ABC notation\nto MIDI format. In addition, bGPT demonstrates\nexceptional capabilities in simulating CPU be-\nhaviour, with an accuracy exceeding 99.99% in\nexecuting various operations. Leveraging next\nbyte prediction, models like bGPT can directly\nlearn from vast binary data, effectively simulating\nthe intricate patterns of the digital world.",
233
- # "user_id": "ExampleUserID",
234
- # "metadata": json.dumps({
235
- # "page_label": "1",
236
- # "file_name": "Beyond Language Models: Byte Models are Digital World Simulators.pdf",
237
- # "file_path": "/home/fullstack/dev/licensed/apps/dspytest/notes/Beyond Language Models: Byte Models are Digital World Simulators.pdf",
238
- # "file_type": "application/pdf",
239
- # "file_size": 771513,
240
- # "creation_date": "2024-03-05",
241
- # "last_modified_date": "2024-03-03"
242
- # })
243
- # }
244
- # """
245
- # # uploaded_file_id: str
246
- # text: str
247
- # # user_id: str
248
- # metadata: dict # Since you're storing JSON, let's keep this as a dictMetadata
249
-
250
- # # from llama_index.node_parser import SimpleNodeParser
251
- # from llama_index.core.node_parser import SimpleNodeParser
252
-
253
-
254
- # # Assuming DocumentData is already defined
255
- # class DocumentData(BaseModel):
256
- # text: str
257
- # metadata: dict
258
-
259
- # def clean_nul_chars(s):
260
- # return s.replace('\x00', '')
261
-
262
-
263
- # def load_data(directory="./books", text_splitter_cls=SentenceSplitter, chunk_size=1000, chunk_overlap=0):
264
- # documents = SimpleDirectoryReader(input_dir=directory, exclude=[]).load_data()
265
-
266
- # if len(documents) == 0:
267
- # raise Exception("No documents found in the specified directory.")
268
-
269
- # document_data_list = []
270
-
271
- # node_parser = SimpleNodeParser(chunk_size=512, chunk_overlap=20, include_metadata=True)
272
-
273
- # from llama_index.core import Document
274
- # nodes = node_parser.get_nodes_from_documents(documents, show_progress=False )
275
-
276
- # document_data_list = [
277
- # DocumentData(
278
- # text=node.text,
279
- # metadata=json.loads(json.dumps(node.metadata)) # Assuming metadata is unchanged per chunk
280
- # ) for node in nodes
281
- # ]
282
- # return document_data_list
283
-
284
- # def save_document_data_list_to_file(document_data_list: List[DocumentData], file_path: str):
285
- # with open(file_path, 'w', encoding='utf-8') as file:
286
- # # Convert list of DocumentData instances to list of dictionaries and then to JSON
287
- # json_data = json.dumps([doc.dict() for doc in document_data_list], indent=4)
288
- # file.write(json_data)
289
-
290
- # def load_document_data_list_from_file(file_path: str) -> List[DocumentData]:
291
- # with open(file_path, 'r', encoding='utf-8') as file:
292
- # # Load JSON data and then convert it to list of DocumentData instances
293
- # json_data = json.load(file)
294
- # document_data_list = parse_obj_as(List[DocumentData], json_data)
295
- # return document_data_list
296
-
297
- # file_path_to_save = './document_data_chunked_input.json'
298
- # document_data_list = load_data()
299
- # save_document_data_list_to_file(document_data_list, file_path_to_save)
300
- # document_data_list = load_document_data_list_from_file(file_path_to_save)
301
-
302
-
303
- # # Function to process a single document
304
- # # Modified main loop to accept tqdm progress bar instance
305
- # def main_loop(q, epoch, output_file, pbar):
306
- # while True:
307
- # document = q.get()
308
- # if document is None:
309
- # q.task_done()
310
- # break # Sentinel value to end thread
311
-
312
- # input_text = document.text
313
-
314
- # kg = text2kg(text=input_text)
315
- # # text2tripleObj = TypedPredictor(Text2Triple)
316
- # # triples = text2tripleObj(text=kg.triples).object
317
- # pp(kg)
318
-
319
- # # Save the summary safely to an output file
320
- # with open('books-kg.jsonl', "a") as f:
321
- # # json.dump({"epoch": epoch, "summary": summary}, f)
322
- # # add metadata, text, kg, and summary to the file
323
- # json.dump({
324
- # "metadata": document.metadata,
325
- # "text": document.text,
326
- # "kg": kg.triples
327
- # }
328
- # ,
329
- # f)
330
- # f.write("\n") # Ensure each summary is on a new line
331
-
332
- # q.task_done()
333
- # pbar.update(1) # Update progress bar
334
-
335
-
336
-
337
-
338
- # q.task_done()
339
- # pbar.update(1) # Update progress bar
340
-
341
- # # Function to initiate processing with threading and tqdm
342
- # def process_documents_with_threads(document_data_list, num_threads=5, output_file="summaries.jsonl"):
343
- # q = queue.Queue()
344
- # threads = []
345
-
346
- # # Initialize tqdm progress bar
347
- # pbar = tqdm(total=len(document_data_list))
348
-
349
- # # Start threads
350
- # for _ in range(num_threads):
351
- # t = threading.Thread(target=main_loop, args=(q, epoch, output_file, pbar))
352
- # t.start()
353
- # threads.append(t)
354
-
355
- # # Add documents to queue
356
- # for document in document_data_list:
357
- # q.put(document)
358
-
359
- # # Add sentinel values to queue to signal threads to stop
360
- # for _ in range(num_threads):
361
- # q.put(None)
362
-
363
- # # Wait for all threads to complete
364
- # for t in threads:
365
- # t.join()
366
-
367
- # pbar.close() # Close progress bar
368
-
369
- # DocumentDataArray = [DocumentData(text=document.text, metadata=document.metadata) for document in document_data_list]
370
- # epoch = 0
371
- # # iteration_seconds = 10 # Number of seconds between iterations
372
- # NUM_THREADS = 18
373
- # try:
374
- # process_documents_with_threads(DocumentDataArray, num_threads=NUM_THREADS, output_file="summaries.jsonl")
375
- # except KeyboardInterrupt:
376
- # print("Processing stopped by user.")
 
triplets2/openai_usage.log DELETED
File without changes
triplets2/optimized_k24-working-42pc.json DELETED
The diff for this file is too large to render. See raw diff
 
triplets2/triples-kg2summary.py DELETED
@@ -1,132 +0,0 @@
-
- from pprint import pp
- from models import document
- import pandas as pd
- import dspy
- from dspy.teleprompt import BootstrapFewShot
- from dspy.evaluate import Evaluate
-
-
- from opus.modules.kg2summary import KG2TextModule
- # from utils.dataframe_utils import dataframe_to_documents
- from opus.utils.dataframe_utils import dataframe_to_documents
-
- from opus.metric.base import AssessPrediction
-
-
- turbo = dspy.OpenAI(model='gpt-3.5-turbo', temperature=0.1, max_tokens=1200, top_p=0.89, api_base='http://localhost:6000/v1/', api_key='asdf', timeout=200)
- # turbo = dspy.OpenAI(model='mixtral-turbo', temperature=0.5, max_tokens=1024, api_base='http://localhost:6000/v1/', api_key='asdf')#, timeout=200)
-
- dspy.settings.configure(lm=turbo)
- NUM_THREADS=10
-
- from pprint import pprint
-
-
- class AssessPrediction(dspy.Signature):
-     """Evaluate and compare two summaries to identify which one more accurately and effectively conveys the key points of the original content.
-
-     As an expert reviewer, your role is to critically assess the clarity, coherence, and completeness of each summary. Consider the following aspects in your evaluation:
-
-     - Clarity: Does the summary present the information in a clear and understandable manner?
-     - Coherence: Is the summary logically organized and do the ideas flow smoothly?
-     - Completeness: Does the summary capture the essential points of the original content without omitting crucial information?
-
-     Based on your analysis, determine which summary does a better job at distilling the essence of the original material, making a sophisticated decision that takes into account not just the factual accuracy but also the readability and overall effectiveness of the summarization.
-     """
-     summary1 = dspy.InputField()
-     summary2 = dspy.InputField()
-
-     assessment_answer = dspy.OutputField(desc="summary1 or summary2")
-
- # class AssessPrediction(dspy.Signature):
- # """Pick the better summary based on the example."""
- # summary1 = dspy.InputField()
- # summary2 = dspy.InputField()
-
- # assessment_answer = dspy.OutputField(desc="summary1 or summary2")
-
- import random
-
-
- def factuality_metric(gold, pred, trace=None):
-     assess = dspy.ChainOfThought(AssessPrediction)
-
-     # if pred.summary_rewrite is "N/A" then default to pred.summary
-     _summary = pred.summary_rewrite if pred.summary_rewrite != "N/A" else pred.summary
-
-     # Initialize summaries with labels before shuffling
-     summaries = [('gold', gold.summary), ('pred', _summary)]
-
-     # Randomize summaries order and prepare for assessment
-     random.shuffle(summaries)
-     assessment_args = {
-         'summary1': summaries[0][1], # First summary after shuffle
-         'summary2': summaries[1][1] # Second summary after shuffle
-     }
-
-     # Keep track of which summary is which
-     summary1_label = summaries[0][0]
-     summary2_label = summaries[1][0]
-
-     # Assess using the randomized summaries
-     _winner = assess(**assessment_args)
-     winner_label = _winner.assessment_answer.split()[0].lower()
-
-     # Determine the winner based on original labels
-     if winner_label == 'summary1':
-         winner_is_gold = summary1_label == 'gold'
-     else:
-         winner_is_gold = summary2_label == 'gold'
-
-     return winner_is_gold
-
-
- def train():
-     df = pd.read_parquet('./data/kg_datasetK17.parquet')
-     print(f"Number of records: {len(df)}")
-     random_sample = df.sample(n=5)
-     print(f"Random sample: {random_sample}")
-
-     # Setup our bootstrap
-     teleprompter = BootstrapFewShot(metric=factuality_metric, max_bootstrapped_demos=4, max_labeled_demos=16)
-     # Get the documents from the parquet file
-     _documents = dataframe_to_documents(df)
-
-     pp(_documents[0].text)
-     # exit()
-
-     documents = []
-     for doc in _documents:
-         # doc.with_inputs('kg')
-         doc = dspy.Example(**doc).with_inputs('kg')
-         documents.append(doc)
-     # random.shuffle(documents)
-
-     # from pprint import pprint
-     # pprint(documents)
-
-     # Split documents into train, validation, and test sets
-     split1, split2 = len(documents) // 3, 2 * len(documents) // 3
-     train, validation, test_set = documents[:split1], documents[split1:split2], documents[split2:]
-
-     train = train[:20]
-     validation = validation[:10]
-     test_set = test_set[:15]
-     optimized_KG2Text = teleprompter.compile(KG2TextModule(), trainset=train, valset=validation)
-
-     # print(f"optimized_KG2Text: {optimized_KG2Text}")
-
-     evaluate = Evaluate(devset=test_set, metric=factuality_metric, num_threads=NUM_THREADS, display_progress=True, display_table=0)
-
-     score, results = evaluate(optimized_KG2Text, return_all_scores=True, return_outputs=True)
-
-     # turbo.inspect_history(n=99)
-
-     print(f"Optimized KG2Text Scores: {score}")
-     print(f"Optimized KG2Text Results: {results[0]}")
-     optimized_KG2Text.save('optimized_KG2Text')
-
- if __name__ == "__main__":
-     train()
-
 
triplets2/triples-kg_syngen.jsonl DELETED
The diff for this file is too large to render. See raw diff
 
triplets2/triples-optimized_text2kg DELETED
The diff for this file is too large to render. See raw diff
 
triplets2/triples-train.py DELETED
@@ -1,55 +0,0 @@
-
- import pandas as pd
- import dspy
- from dspy.teleprompt import BootstrapFewShot
- from dspy.evaluate import Evaluate
-
-
- from opus.modules.text2kg import Text2KGModule
- # from utils.dataframe_utils import dataframe_to_documents
- from opus.utils.dataframe_utils import dataframe_to_documents
- from opus.metric.base import factuality_metric
-
-
- turbo = dspy.OpenAI(model='gpt-3.5-turbo', temperature=0.1, max_tokens=1200, top_p=0.89, api_base='http://localhost:6000/v1/', api_key='asdf', timeout=200)
- # turbo = dspy.OpenAI(model='mixtral-turbo', temperature=0.5, max_tokens=1024, api_base='http://localhost:6000/v1/', api_key='asdf')#, timeout=200)
-
- dspy.settings.configure(lm=turbo)
- NUM_THREADS=10
-
- def train():
-     df = pd.read_parquet('./data/kg_datasetK17.parquet')
-     print(f"Number of records: {len(df)}")
-     random_sample = df.sample(n=5)
-     print(f"Random sample: {random_sample}")
-
-     # Setup our bootstrap
-     teleprompter = BootstrapFewShot(metric=factuality_metric, max_bootstrapped_demos=4, max_labeled_demos=16)
-     # Get the documents from the parquet file
-     documents = dataframe_to_documents(df)
-     # random.shuffle(documents)
-
-     # Split documents into train, validation, and test sets
-     split1, split2 = len(documents) // 3, 2 * len(documents) // 3
-     train, validation, test_set = documents[:split1], documents[split1:split2], documents[split2:]
-
-     train = train[:10]
-     validation = validation[:10]
-     test_set = test_set[:15]
-     optimized_text2kg = teleprompter.compile(Text2KGModule(), trainset=train, valset=validation)
-
-     # print(f"optimized_text2kg: {optimized_text2kg}")
-
-     evaluate = Evaluate(devset=test_set, metric=factuality_metric, num_threads=NUM_THREADS, display_progress=True, display_table=0)
-
-     score, results = evaluate(optimized_text2kg, return_all_scores=True, return_outputs=True)
-
-     # turbo.inspect_history(n=99)
-
-     print(f"Optimized Text2KG Scores: {score}")
-     print(f"Optimized Text2KG Results: {results[0]}")
-     optimized_text2kg.save('optimized_text2kg')
-
- if __name__ == "__main__":
-     train()
-