jfrery-zama committed on
Commit
df6182e
·
1 Parent(s): 1fa0767

space working with chatgpt 4

Browse files
.gitignore CHANGED
@@ -1 +1,2 @@
1
- __pycache__/
 
 
1
+ __pycache__/
2
+ .venv/
anonymize_file_clear.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+ import re
4
+ import uuid
5
+ from pathlib import Path
6
+ import gensim
7
+ from concrete.ml.common.serialization.loaders import load
8
+
9
def load_models():
    """Load the FastText embeddings and the serialized Concrete ML classifier.

    Both artifacts are looked up in the directory that contains this script.

    Returns:
        A ``(embeddings_model, fhe_ner_detection)`` tuple: the gensim FastText
        model read from ``embedded_model.model`` and the object returned by
        Concrete ML's ``load`` for ``cml_xgboost.model``.
    """
    script_dir = Path(__file__).parent
    embeddings_path = script_dir / "embedded_model.model"
    classifier_path = script_dir / "cml_xgboost.model"

    # gensim expects a string path, hence the explicit conversion.
    embeddings_model = gensim.models.FastText.load(str(embeddings_path))

    with open(classifier_path, "r") as model_file:
        fhe_ner_detection = load(file=model_file)

    return embeddings_model, fhe_ner_detection
15
+
16
def anonymize_text(text, embeddings_model, fhe_ner_detection):
    """Map every word of *text* flagged as sensitive to a short identifier.

    The text is split into word-like tokens (letters, digits, ``_`` and the
    characters ``. / - @``) and punctuation/whitespace runs. Each distinct
    word-like token is embedded and classified; tokens whose positive-class
    probability is at least 0.5 receive an 8-character identifier taken from
    the start of a random UUID4.

    Args:
        text: The text to analyze.
        embeddings_model: Object exposing ``wv[token]`` (indexed with
            ``[None]`` before being passed to the classifier).
        fhe_ner_detection: Object exposing ``predict_proba(x)``; the value at
            ``[0][1]`` of its result is compared against the 0.5 threshold.

    Returns:
        A dict mapping each flagged token to its identifier. The same token
        always maps to the same identifier, and no two tokens share one.
    """
    token_pattern = r"(\b[\w\.\/\-@]+\b|[\s,.!?;:'\"-]+)"
    uuid_map = {}
    # Identifiers already handed out; truncated UUIDs can collide, and the
    # mapping is only reversible if every identifier is unique.
    used_ids = set()

    for token in re.findall(token_pattern, text):
        # Whitespace and punctuation runs are never classified.
        if not token.strip() or not re.match(r"\w+", token):
            continue
        # A token that already has an identifier needs no second inference.
        if token in uuid_map:
            continue

        x = embeddings_model.wv[token][None]
        probability = fhe_ner_detection.predict_proba(x)[0][1]
        if probability >= 0.5:
            short_id = str(uuid.uuid4())[:8]
            while short_id in used_ids:
                short_id = str(uuid.uuid4())[:8]
            used_ids.add(short_id)
            uuid_map[token] = short_id

    return uuid_map
38
+
39
def main():
    """Command-line entry point: write the identifier mapping for a text file.

    Reads the file given as the single positional argument, runs
    ``anonymize_text`` on its content and dumps the resulting mapping as JSON
    to ``<input stem>_uuid_mapping.json``. The output name is a bare relative
    path, so the file is created in the current working directory.
    """
    arg_parser = argparse.ArgumentParser(
        description="Anonymize named entities in a text file and save the mapping to a JSON file."
    )
    arg_parser.add_argument("file_path", type=str, help="The path to the file to be processed.")
    cli_args = arg_parser.parse_args()

    embeddings_model, fhe_ner_detection = load_models()

    # Load the whole document in one go.
    with open(cli_args.file_path, 'r', encoding='utf-8') as source_file:
        document = source_file.read()

    # Only the mapping is kept; the anonymized text itself is not written here.
    token_to_id = anonymize_text(document, embeddings_model, fhe_ner_detection)

    output_name = Path(cli_args.file_path).stem + "_uuid_mapping.json"
    with open(output_name, 'w', encoding='utf-8') as mapping_file:
        json.dump(token_to_id, mapping_file, indent=4, sort_keys=True)

    print(f"UUID mapping saved to {output_name}")
59
+
60
+ if __name__ == "__main__":
61
+ main()
app.py CHANGED
@@ -1,13 +1,19 @@
1
  """A Gradio app for anonymizing text data using FHE."""
2
 
3
  import gradio as gr
4
- import re
5
  from fhe_anonymizer import FHEAnonymizer
6
  import pandas as pd
7
-
 
 
 
8
 
9
  anonymizer = FHEAnonymizer()
10
 
 
 
 
 
11
 
12
  def deidentify_text(input_text):
13
  anonymized_text, identified_words_with_prob = anonymizer(input_text)
@@ -22,10 +28,62 @@ def deidentify_text(input_text):
22
  return anonymized_text, identified_df
23
 
24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  # Default demo text from the file
26
  with open("demo_text.txt", "r") as file:
27
  default_demo_text = file.read()
28
 
 
 
 
 
 
 
29
  demo = gr.Blocks()
30
 
31
  with demo:
@@ -50,33 +108,23 @@ with demo:
50
  with gr.Accordion("What is Encrypted Anonymization?", open=False):
51
  gr.Markdown(
52
  """
53
- Encrypted Anonymization leverages Fully Homomorphic Encryption (FHE) to protect sensitive information during data processing. This approach allows for the anonymization of text data, such as personal identifiers, while ensuring that the data remains encrypted throughout the entire process. It enables organizations to utilize sensitive data for analytics and machine learning without compromising individual privacy or security.
54
  """
55
  )
56
 
57
- with gr.Accordion("Why is privacy important in data processing?", open=False):
58
- gr.Markdown(
59
- """
60
- Privacy in data processing is critical to protect individuals' personal information from unauthorized access and potential misuse. With the increasing amount of personal data being collected and analyzed, the risks associated with data breaches and identity theft have also risen. By implementing privacy-preserving techniques, such as encrypted anonymization, organizations can safeguard sensitive information, build trust with their customers, and comply with stringent data protection regulations.
61
- """
62
- )
63
 
64
- with gr.Accordion(
65
- "How does Fully Homomorphic Encryption enhance data privacy?", open=False
66
- ):
67
- gr.Markdown(
68
- """
69
- Fully Homomorphic Encryption (FHE) enhances data privacy by enabling computations on encrypted data without needing to decrypt it first. This revolutionary technology ensures that sensitive data can be processed and analyzed securely, without exposing it to potential threats. FHE is a game-changer for privacy-preserving computations, allowing for the secure analysis of encrypted data, which is particularly beneficial in sectors like finance, healthcare, and beyond.
70
- """
71
- )
72
 
73
- gr.Markdown(
74
- """
75
- <p align="center">
76
- <img src="file/images/banner.png">
77
- </p>
78
- """
79
- )
80
 
81
  with gr.Row():
82
  input_text = gr.Textbox(
@@ -88,7 +136,7 @@ with demo:
88
 
89
  anonymized_text_output = gr.Textbox(label="Anonymized Text", lines=13)
90
 
91
- identified_words_output = gr.Dataframe(label="Identified Words")
92
 
93
  submit_button = gr.Button("Anonymize")
94
 
@@ -98,6 +146,16 @@ with demo:
98
  outputs=[anonymized_text_output, identified_words_output],
99
  )
100
 
 
 
 
 
 
 
 
 
 
 
101
 
102
  # Launch the app
103
  demo.launch(share=False)
 
1
  """A Gradio app for anonymizing text data using FHE."""
2
 
3
  import gradio as gr
 
4
  from fhe_anonymizer import FHEAnonymizer
5
  import pandas as pd
6
+ from openai import OpenAI
7
+ import os
8
+ import json
9
+ import re
10
 
11
  anonymizer = FHEAnonymizer()
12
 
13
+ client = OpenAI(
14
+ api_key=os.environ.get("openaikey"),
15
+ )
16
+
17
 
18
  def deidentify_text(input_text):
19
  anonymized_text, identified_words_with_prob = anonymizer(input_text)
 
28
  return anonymized_text, identified_df
29
 
30
 
31
def query_chatgpt(anonymized_query):
    """Ask ChatGPT an anonymized question about the anonymized document.

    The system prompt is read from ``files/chatgpt_prompt.txt`` and the
    document from ``files/anonymized_document.txt``. Identifiers found in the
    answer are translated back to the original words using the mapping stored
    in ``original_document_uuid_mapping.json``.

    Args:
        anonymized_query: The user's question, already anonymized.

    Returns:
        A ``(anonymized_response, deanonymized_response)`` tuple of strings:
        the raw model answer and the same answer with known identifiers
        replaced by the words they stand for.
    """
    with open("files/anonymized_document.txt", "r") as file:
        anonymized_document = file.read()
    with open("files/chatgpt_prompt.txt", "r") as file:
        prompt = file.read()

    # The newline after the document's closing fence keeps "Query:" on its own
    # line instead of being glued to the fence.
    query = (
        "Document content:\n```\n" + anonymized_document + "\n\n```\n"
        + "Query:\n```\n" + anonymized_query + "\n```"
    )

    completion = client.chat.completions.create(
        model="gpt-4-1106-preview",  # Replace with "gpt-4" if available
        messages=[
            {"role": "system", "content": prompt},
            {"role": "user", "content": query},
        ],
    )
    # NOTE(review): the OpenAI client appears to type ``content`` as optional -
    # confirm. Falling back to "" keeps the regex pass below from receiving None.
    anonymized_response = completion.choices[0].message.content or ""

    with open("original_document_uuid_mapping.json", "r") as file:
        uuid_map = json.load(file)
    inverse_uuid_map = {v: k for k, v in uuid_map.items()}  # TODO load the inverse mapping from disk for efficiency

    # Substitute only the word-like tokens and leave every other character in
    # place, so characters outside the token grammar (parentheses, "$",
    # markdown markers, ...) survive in the deanonymized answer.
    word_pattern = r"\b[\w\.\/\-@]+\b"
    deanonymized_response = re.sub(
        word_pattern,
        lambda match: inverse_uuid_map.get(match.group(0), match.group(0)),
        anonymized_response,
    )
    return anonymized_response, deanonymized_response
75
+
76
+
77
  # Default demo text from the file
78
  with open("demo_text.txt", "r") as file:
79
  default_demo_text = file.read()
80
 
81
+ with open("files/original_document.txt", "r") as file:
82
+ original_document = file.read()
83
+
84
+ with open("files/anonymized_document.txt", "r") as file:
85
+ anonymized_document = file.read()
86
+
87
  demo = gr.Blocks()
88
 
89
  with demo:
 
108
  with gr.Accordion("What is Encrypted Anonymization?", open=False):
109
  gr.Markdown(
110
  """
111
+ Encrypted Anonymization leverages Fully Homomorphic Encryption (FHE) to protect sensitive information during data processing. This approach allows for the anonymization of text data, such as personal identifiers, while ensuring that the data remains encrypted throughout the entire process.
112
  """
113
  )
114
 
115
+ with gr.Accordion("Original Document", open=False):
116
+ gr.Markdown(original_document)
 
 
 
 
117
 
118
+ with gr.Accordion("Anonymized Document", open=False):
119
+ gr.Markdown(anonymized_document)
 
 
 
 
 
 
120
 
121
+ # gr.Markdown(
122
+ # """
123
+ # <p align="center">
124
+ # <img src="file/images/banner.png">
125
+ # </p>
126
+ # """
127
+ # )
128
 
129
  with gr.Row():
130
  input_text = gr.Textbox(
 
136
 
137
  anonymized_text_output = gr.Textbox(label="Anonymized Text", lines=13)
138
 
139
+ identified_words_output = gr.Dataframe(label="Identified Words", visible=False)
140
 
141
  submit_button = gr.Button("Anonymize")
142
 
 
146
  outputs=[anonymized_text_output, identified_words_output],
147
  )
148
 
149
+ with gr.Row():
150
+ chatgpt_response_anonymized = gr.Textbox(label="ChatGPT Anonymized Response", lines=13)
151
+ chatgpt_response_deanonymized = gr.Textbox(label="ChatGPT Deanonymized Response", lines=13)
152
+
153
+ chatgpt_button = gr.Button("Query ChatGPT")
154
+ chatgpt_button.click(
155
+ query_chatgpt,
156
+ inputs=[anonymized_text_output],
157
+ outputs=[chatgpt_response_anonymized, chatgpt_response_deanonymized],
158
+ )
159
 
160
  # Launch the app
161
  demo.launch(share=False)
demo_text.txt CHANGED
@@ -1,10 +1 @@
1
- Hello, my name is David Johnson and I live in Maine.
2
- My credit card number is 4095-2609-9393-4932 and my crypto wallet id is 16Yeky6GMjeNkAiNcBY7ZhrLoMSgg1BoyZ.
3
-
4
- On September 18 I visited microsoft.com and sent an email to test@presidio.site, from the IP 192.168.0.1.
5
-
6
- My passport: 191280342 and my phone number: (212) 555-1234.
7
-
8
- This is a valid International Bank Account Number: IL150120690000003111111 . Can you please check the status on bank account 954567876544?
9
-
10
- Kate's social security number is 078-05-1126. Her driver license? it is 1234567A.
 
1
+ Who lives in Maine?
 
 
 
 
 
 
 
 
 
fhe_anonymizer.py CHANGED
@@ -3,6 +3,8 @@ import re
3
  from concrete.ml.deployment import FHEModelClient, FHEModelServer
4
  from pathlib import Path
5
  from concrete.ml.common.serialization.loaders import load
 
 
6
 
7
  base_dir = Path(__file__).parent
8
 
@@ -17,6 +19,9 @@ class FHEAnonymizer:
17
  with open(base_dir / "cml_xgboost.model", "r") as model_file:
18
  self.fhe_ner_detection = load(file=model_file)
19
 
 
 
 
20
  path_to_model = (base_dir / "deployment").resolve()
21
  self.client = FHEModelClient(path_to_model)
22
  self.server = FHEModelServer(path_to_model)
@@ -30,57 +35,33 @@ class FHEAnonymizer:
30
  return y
31
 
32
  def __call__(self, text: str):
33
- text = self.preprocess_sentences(text)
34
- identified_words_with_prob = [] # tuples of (word, probability)
35
- new_text = []
 
 
 
 
 
 
 
 
 
36
 
37
- for word in text.split():
38
  # Prediction for each word
39
- x = self.embeddings_model.wv[word][None]
40
  prediction_proba = self.fhe_ner_detection.predict_proba(x)
41
- # prediction = self.fhe_inference(x).argmax(1)[0]
42
- # print(word, prediction)
43
  probability = prediction_proba[0][1]
44
- prediction = probability >= 0.5
45
- if prediction == 1:
46
- identified_words_with_prob.append((word, probability))
47
- new_text.append("<REMOVED>")
48
- else:
49
- new_text.append(word)
50
-
51
- # Joining the modified text
52
- modified_text = " ".join(new_text)
53
-
54
- return modified_text, identified_words_with_prob
55
-
56
- def preprocess_sentences(self, sentence, verbose=False):
57
- """Preprocess the sentence."""
58
-
59
- sentence = re.sub(r"\n+", " ", sentence)
60
- if verbose:
61
- print(sentence)
62
 
63
- sentence = re.sub(" +", " ", sentence)
64
- if verbose:
65
- print(sentence)
66
-
67
- sentence = re.sub(r"'s\b", " s", sentence)
68
- if verbose:
69
- print(sentence)
70
-
71
- sentence = re.sub(r"\s([,.!?;:])", r"\1", sentence)
72
- if verbose:
73
- print(sentence)
74
-
75
- pattern = r"(?<!\w)[{}]|[{}](?!\w)".format(
76
- re.escape(self.punctuation_list), re.escape(self.punctuation_list)
77
- )
78
- sentence = re.sub(pattern, "", sentence)
79
- if verbose:
80
- print(sentence)
81
-
82
- sentence = re.sub(r"\s([,.!?;:])", r"\1", sentence)
83
- if verbose:
84
- print(sentence)
85
 
86
- return sentence
 
 
 
3
  from concrete.ml.deployment import FHEModelClient, FHEModelServer
4
  from pathlib import Path
5
  from concrete.ml.common.serialization.loaders import load
6
+ import uuid
7
+ import json
8
 
9
  base_dir = Path(__file__).parent
10
 
 
19
  with open(base_dir / "cml_xgboost.model", "r") as model_file:
20
  self.fhe_ner_detection = load(file=model_file)
21
 
22
+ with open(base_dir / "original_document_uuid_mapping.json", 'r') as file:
23
+ self.uuid_map = json.load(file)
24
+
25
  path_to_model = (base_dir / "deployment").resolve()
26
  self.client = FHEModelClient(path_to_model)
27
  self.server = FHEModelServer(path_to_model)
 
35
  return y
36
 
37
  def __call__(self, text: str):
38
+ # Pattern to identify words and non-words (including punctuation, spaces, etc.)
39
+ token_pattern = r"(\b[\w\.\/\-@]+\b|[\s,.!?;:'\"-]+)"
40
+ tokens = re.findall(token_pattern, text)
41
+ identified_words_with_prob = []
42
+ processed_tokens = []
43
+
44
+ print(tokens)
45
+ for token in tokens:
46
+ # Directly append non-word tokens or whitespace to processed_tokens
47
+ if not token.strip() or not re.match(r"\w+", token):
48
+ processed_tokens.append(token)
49
+ continue
50
 
 
51
  # Prediction for each word
52
+ x = self.embeddings_model.wv[token][None]
53
  prediction_proba = self.fhe_ner_detection.predict_proba(x)
 
 
54
  probability = prediction_proba[0][1]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
 
56
+ if probability >= 0.5:
57
+ identified_words_with_prob.append((token, probability))
58
+ # Use the existing UUID if available, otherwise generate a new one
59
+ tmp_uuid = self.uuid_map.get(token, str(uuid.uuid4())[:8])
60
+ processed_tokens.append(tmp_uuid)
61
+ self.uuid_map[token] = tmp_uuid
62
+ else:
63
+ processed_tokens.append(token)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
 
65
+ # Reconstruct the sentence
66
+ reconstructed_sentence = ''.join(processed_tokens)
67
+ return reconstructed_sentence, identified_words_with_prob
files/anonymized_document.txt CHANGED
@@ -1,54 +1,10 @@
1
- ca7a17e0 297a7a2c and 47bae4dd f1118b85 for "207c2210 b0572c9c."
2
- 6bd00984 15642039
3
 
4
- 5437eb6f document 259323e3 a 31ab92f5 8eaa0b83 cf94131c and 17f2c576 2a2c7407 plan for cd947192 0540cbbb., a leading 1d557b0d 6d25d852 2fa3434e in developing 65a5ebb9 bd3fdeea for the digital market. 6b80462c document e218c683 3d0b62ab data and ee796684 74629446 that are based on 3d59250b scenarios and dcf1fdbe b0ea476c, ensuring no real 2361ac41 or 857b9636 1bdb9e71 data are 4d87a9cd.
5
- b8859a2c 553cc220
6
 
7
- 58e3af47 171d6281., 8033ec9c in 8b700cc6 by 1e307c1e c7c0231f fd67020c and f6e600b4 41ad12d1, has d0a0127a rapid growth in the daf3dc5f 048e01b9, focusing on areas aa85b69f as 4c23ac66 76bb2672, cloud computing, and e16dd9ad 516c4333. c8ddf3e1 033d58f5 b907bfb3 in 5742554b 2a08bcbe and 2a503bc4 across the 5b5218fd, the b15d6bc2 ae901e0a to 30d358b7 how 14bd9782 0e9ce93c in the digital 3bbeab83.
8
- a28dbb08 9f84ce23
9
- 90112484 5c1705ba
10
 
11
- a9c81aad 819cc7fe. has bbde0c42 53aefe16 revenue through various a48d5930, including software 91daf2e0, 78064bc8 services, and ac852356 sales. In the 478060c5 e9a560ef d59684ea, the 12b4539d reported the 7b3f3216 eee94704 figures:
12
 
13
- 63d7b6bc Subscriptions: 6090defb million
14
- 4cd9e789 a1f51933: 019ac660 million
15
- 3196fc94 Sales: 50311924 million
16
-
17
- 496b82c2 bdf223ed: da54a11a million
18
- 61307cb8 and f4e35ce3
19
-
20
- 0a5f5ee5 3a0429d8's 904bce1e are ce79e4ff 8d5303a8 with research and 0b4a7271 RD, sales and 0807fdda, and c1d7820d 4c0101b2. c5894e7a a0e3c060 are the 2b81915c figures for the 2042d5df f2ddfe31 6cee2393:
21
-
22
- RD: 2bfddef5 million
23
- Sales and 8e7609fc: 682daa09 million
24
- 1eb46a02 93908764: c4a6681a million
25
-
26
- 9cc85bb0 9b15cd0b: 20c5b8db million
27
-
28
- 30d55f53 ef4684e2 for 274f506b 7153d60a: 92063576 million
29
- 26fa65d5 299f9a34 3e07531f
30
-
31
- To ensure f48418ed growth and 4e52397d, d93931c1 bf8dd379. needs to focus on several key areas. 8b71b2c4 plan b8d6f955 77273f70 eff17325 6e32e65a, using bd178c9b data and 56935895 for 94fe5b9b 5c729e9f.
32
- b56f5f8e into f17b897f f0458470
33
-
34
- fbe11266 the global digital c5bc7803 79c37224, emerging markets present f6ea9661 9c3eaa4d for growth. c1d933f6 plans to 7612d3f9 e64531b7 in e3a6050a bd2a43c4 as 375af421, 5dc6aed3, and 60fd1ca9 7675408f, 2bfce010 to 71336309 revenue by b40729e6 over the 807f7c70 16671a96 933e82e7.
35
- 430afebf e77cc3ff
36
-
37
- cfdd4e98 841ff829 in RD is 206d362e for maintaining a 13bb2340 bbf9e6e6. 9132698f 1c52e182 16d64f13 million 8d9c99ec developing new d1e42c0a 62e93c73 and cloud f6017e35, projecting a 313b7853 8b793da0 in software 8719a9b3 revenue by 1bfef750.
38
- 820123a4 and 12093506
39
-
40
- To 14785122 1ce8ed84 product d21e9b14 and 3cafa2cb 8b2d9344 market 9a9ea085, 3a1c4899 will 2c8df8f2 7bef6ad5 8def87a9 and 8c2453f7. eb9d3366 02bf0ddf has set 99e4cfd8 a fund of a1dbd949 million for 1168967a 180c6196 in 68b2be65 and f3d42218 aa61a94d, expecting to enhance 5c96deb8 offering and drive a ba5f69cf revenue growth.
41
- f7531366 8367b98b
42
-
43
- Identifying and a1c193cd dd578067 is 45ba7c0c for the d9f1babd of 6b2c4206's growth. 734bf200 60ed7498 16a3f35e:
44
-
45
- a860d0af 911f97f6: 9a030bf5 ce10476f and adaptation to 76f34e5f 703994f9 are b1a68f0d.
46
- 2cb167d2 53ba4fff: edfc1496 ahead in f0426ec0 to meet 3443e779 market 0c1ac39b.
47
- 6c72c52f e2d3bfae: fc5d928d 71902961 to global 1277dd76, 3374e1d9 in new markets.
48
-
49
- d0c05d81
50
-
51
- 20c10c38 document ac328034 a 55d3f414 80a154f5 7b580e92 and 8c5457ab b75d19ad plan for af820f2a 83382bae., using 7dc8aae7 851e65f7 and figures. It is designed to bd7f1439 how the 6f55f229 can fb8a773e the 5f837d8d of the global digital market, 42f6054b ba70b81c for growth while c9a6eef7 potential 2252cf74. cd4c1afa data and scenarios d9d31530 71a5d9f4 are 09d6c906 3bfc963f and 5352f583 6ef4d7c7 be d69e06ae c65365d9 of 828e04dd real 003af3df or b26c90f6.
52
- 7275fe97
53
-
54
- 7ab379b9 3e911aa4 figures, 41df9da7, and scenarios 24ded709 in 765fb783 document are b14afae8 351fbe3c and created for 62b7490d e6f0cc19 only. 7917e19d do 9b4f4c24 50c52f98 real a92adfc3, c25f65bc, or 17e4168d 3224758c data. 0de09cb5 document is 631d727a as a 25114a3c d7470130 in df9c5fed 6ac1a4e7 and 5430f574 planning.
 
1
+ Hello, 0a182475 name is 84e24eb8 0ea0e35c and 9f2963af live in 5d0a593d.
2
+ 6d7a71e0 credit card number is 1155d2a5 and 0a182475 d8e7c7fc c627b841 id is be0eddce.
3
 
4
+ On b926f540 ee1fa38f 9f2963af visited 1e63e774 and sent an email to f70cf334, from the IP 2d66aefe.
 
5
 
6
+ 6d7a71e0 passport: 53b52085 and 0a182475 phone number: 81133b16 08aeb9f5.
 
 
7
 
8
+ 3bbcaf02 is a valid 11e27c41 5a60bcfa 3730179e 97008100: ab18e6bb . Can 773b08cb f4953628 check the 8ff83dc3 on bank account e1a23ada?
9
 
10
+ 7ab7335c's social security number is 11e90148. 67e5e61e driver license? it is c6652117.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
files/chatgpt_prompt.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ Task: Analyze an anonymized document containing sensitive information, where personal details are replaced with unique identifiers (alphanumeric). You're to answer queries based on this document, using these identifiers to reference specific data.
2
+
3
+ Details:
4
+
5
+ - Sensitive information includes: names, locations, credit card numbers, email addresses, IP addresses, passport details, phone numbers, bank accounts, social security numbers, and driver's licenses.
6
+ - Each piece of information is represented by a unique identifier, maintaining privacy while discussing document content.
7
+ - Your role is to interpret the document's anonymized content and accurately respond to queries using the identifiers.
8
+ - Consistency in identifiers is crucial for connecting the text with the queries correctly.
9
+ - You must not discuss the anonymized nature of the text; use the identifiers as if they were real words for a smooth chat with users.
files/original_document.txt CHANGED
@@ -1,54 +1,10 @@
1
- Financial Analysis and Strategy Development for "TechInnovate Inc."
2
- Executive Summary
3
 
4
- This document provides a comprehensive financial analysis and strategic development plan for TechInnovate Inc., a leading technology firm specializing in developing innovative solutions for the digital market. The document contains synthesized data and proposed strategies that are based on hypothetical scenarios and fictional characters, ensuring no real individuals or actual company data are represented.
5
- Company Overview
6
 
7
- TechInnovate Inc., established in 2010 by founders Alex Johnson and Jamie Smith, has seen rapid growth in the technology sector, focusing on areas such as AI-driven analytics, cloud computing, and IoT solutions. With its headquarters in Silicon Valley and offices across the globe, the company aims to revolutionize how businesses operate in the digital age.
8
- Financial Analysis
9
- Revenue Streams
10
 
11
- TechInnovate Inc. has diversified its revenue through various channels, including software subscriptions, consulting services, and hardware sales. In the fiscal year 2023, the company reported the following fictional figures:
12
 
13
- Software Subscriptions: $150 million
14
- Consulting Services: $50 million
15
- Hardware Sales: $30 million
16
-
17
- Total Revenue: $230 million
18
- Costs and Expenses
19
-
20
- The company's costs are primarily associated with research and development (R&D), sales and marketing, and administrative expenses. The following are the fabricated figures for the fiscal year 2023:
21
-
22
- R&D: $70 million
23
- Sales and Marketing: $50 million
24
- Administrative Expenses: $30 million
25
-
26
- Total Expenses: $150 million
27
-
28
- Net Profit for FY 2023: $80 million
29
- Strategic Development Plan
30
-
31
- To ensure continued growth and profitability, TechInnovate Inc. needs to focus on several key areas. The plan below outlines strategic initiatives, using fictitious data and assumptions for illustration purposes.
32
- Expansion into Emerging Markets
33
-
34
- With the global digital transformation accelerating, emerging markets present significant opportunities for growth. TechInnovate plans to establish operations in countries such as India, Brazil, and South Africa, aiming to increase revenue by 20% over the next three years.
35
- Product Innovation
36
-
37
- Continued investment in R&D is crucial for maintaining a competitive edge. TechInnovate allocates $100 million towards developing new AI algorithms and cloud solutions, projecting a 30% increase in software subscription revenue by 2025.
38
- Partnerships and Acquisitions
39
-
40
- To diversify its product portfolio and strengthen its market position, TechInnovate will seek strategic partnerships and acquisitions. The company has set aside a fund of $200 million for acquiring startups in AI and IoT sectors, expecting to enhance its offering and drive a 25% revenue growth.
41
- Risk Management
42
-
43
- Identifying and mitigating risks is essential for the sustainability of TechInnovate's growth. Potential risks include:
44
-
45
- Market Competition: Continuous analysis and adaptation to competitive strategies are required.
46
- Technological Changes: Staying ahead in innovation to meet evolving market demands.
47
- Regulatory Compliance: Ensuring adherence to global regulations, especially in new markets.
48
-
49
- Conclusion
50
-
51
- This document presents a fictional financial analysis and strategic development plan for TechInnovate Inc., using made-up names and figures. It is designed to showcase how the company can navigate the complexities of the global digital market, leveraging opportunities for growth while managing potential risks. All data and scenarios mentioned herein are entirely fabricated and should not be considered reflective of any real individuals or entities.
52
- Disclaimer
53
-
54
- The financial figures, names, and scenarios presented in this document are purely fictional and created for illustrative purposes only. They do not represent real individuals, companies, or actual financial data. This document is intended as a hypothetical exercise in financial analysis and strategic planning.
 
1
+ Hello, my name is David Johnson and I live in Maine.
2
+ My credit card number is 4095-2609-9393-4932 and my crypto wallet id is 16Yeky6GMjeNkAiNcBY7ZhrLoMSgg1BoyZ.
3
 
4
+ On September 18 I visited microsoft.com and sent an email to test@presidio.site, from the IP 192.168.0.1.
 
5
 
6
+ My passport: 191280342 and my phone number: (212) 555-1234.
 
 
7
 
8
+ This is a valid International Bank Account Number: IL150120690000003111111 . Can you please check the status on bank account 954567876544?
9
 
10
+ Kate's social security number is 078-05-1126. Her driver license? it is 1234567A.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
original_document_uuid_mapping.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "078-05-1126": "11e90148",
3
+ "1234567A": "c6652117",
4
+ "16Yeky6GMjeNkAiNcBY7ZhrLoMSgg1BoyZ": "be0eddce",
5
+ "18": "ee1fa38f",
6
+ "191280342": "53b52085",
7
+ "192.168.0.1": "2d66aefe",
8
+ "212": "81133b16",
9
+ "4095-2609-9393-4932": "1155d2a5",
10
+ "555-1234": "08aeb9f5",
11
+ "954567876544": "e1a23ada",
12
+ "Account": "3730179e",
13
+ "Bank": "5a60bcfa",
14
+ "David": "84e24eb8",
15
+ "Her": "67e5e61e",
16
+ "I": "9f2963af",
17
+ "IL150120690000003111111": "ab18e6bb",
18
+ "International": "11e27c41",
19
+ "Johnson": "0ea0e35c",
20
+ "Kate": "7ab7335c",
21
+ "Maine": "5d0a593d",
22
+ "My": "6d7a71e0",
23
+ "Number": "97008100",
24
+ "September": "b926f540",
25
+ "This": "3bbcaf02",
26
+ "crypto": "d8e7c7fc",
27
+ "microsoft.com": "1e63e774",
28
+ "my": "0a182475",
29
+ "please": "f4953628",
30
+ "status": "8ff83dc3",
31
+ "test@presidio.site": "f70cf334",
32
+ "wallet": "c627b841",
33
+ "you": "773b08cb"
34
+ }
requirements.txt CHANGED
@@ -1,3 +1,4 @@
1
  concrete-ml==1.5.0rc0
2
  gensim==4.3.2
3
- gradio==3.40.1
 
 
1
  concrete-ml==1.5.0rc0
2
  gensim==4.3.2
3
+ gradio==3.40.1
4
+ openai==1.13.3
utils_demo.py CHANGED
@@ -1,274 +0,0 @@
1
-
2
- import logging
3
- import re
4
- import string
5
- from flair.data import Sentence
6
- from flair.models import SequenceTagger
7
- from presidio_analyzer import AnalyzerEngine
8
- from presidio_anonymizer import AnonymizerEngine
9
-
10
-
11
-
12
# Integer code for every fused NER label. fuse_ner_labels reads this table to
# produce its multi-class label and registers any unseen label at runtime with
# the code len(entity_label_to_code_map).
# NOTE(review): codes 31 and 32 are unused, while 35 and 36 are each assigned
# to two labels ('<CRYPTO>'/'<EMAIL_ADDRESS>' and '<IP_ADDRESS>'/'<US_PASSPORT>'),
# so those labels cannot be distinguished by code — confirm this is intended.
entity_label_to_code_map = {'<PERSON>': 0,
                            '<O>': 1,
                            '<MISC>-<NRP>': 2,
                            '<NUMBER>': 3,
                            '<PER>-<LOCATION>': 4,
                            '<LOC>': 5,
                            '<MISC>': 6,  # Miscellaneous: doesn't fall into the more common categories of PERSON, LOCATION, ORGANIZATION,
                            '<DATE_TIME>': 7,
                            '<LOCATION>': 8,
                            '<PRONOUNS>': 9,
                            '<IN_PAN>': 10,
                            '<MISC>-<DATE_TIME>': 11,
                            '<ORG>': 12,
                            '<MISC>-<IN_PAN>': 13,
                            '<MISC>-<LOCATION>': 14,
                            '<PER>': 15,
                            '<MISC>-<PERSON>': 16,
                            '<LOC>-<PERSON>': 17,
                            '<PHONE_NUMBER>': 18,
                            '<LOC>-<DATE_TIME>': 19,
                            '<LOC>-<NRP>': 20,
                            '<NRP>': 21,
                            '<ORG>-<PERSON>': 22,
                            '<PER>-<NRP>': 23,
                            '<ORG>-<LOCATION>': 24,
                            '<PER>-<DATE_TIME>': 25,
                            '<PER>-<IN_PAN>': 26,
                            '<ORG>-<IN_PAN>': 27,
                            '<ORG>-<NRP>': 28,
                            '<US_DRIVER_LICENSE>': 29,
                            # NOTE(review): this key has unbalanced angle brackets and
                            # looks malformed — confirm the intended label name.
                            '<KEY <EMAIL_ADDRESS>': 30,
                            '<US_BANK_NUMBER>': 33,
                            '<IN_AADHAAR>': 34,
                            '<CRYPTO>': 35,
                            '<IP_ADDRESS>': 36,
                            '<EMAIL_ADDRESS>': 35,  # same code as '<CRYPTO>'
                            '<US_PASSPORT>': 36,  # same code as '<IP_ADDRESS>'
                            '<US_SSN>': 37,
                            '<MISC>-<URL>': 38}
51
-
52
-
53
# Words that fuse_ner_labels tags as "<PRONOUNS>": personal pronouns in lower
# and capitalised form, plus a few honorifics.
# Every entry must be followed by a comma: adjacent string literals are
# concatenated by Python, which previously merged 'ourselves' and 'Me' into
# the single bogus entry 'ourselvesMe'.
pronoun_list = [
    'I', 'i', 'me', 'my', 'mine', 'myself', 'you', 'your', 'yours', "I'm", "I am",
    'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "i'm",
    'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them',
    'their', 'theirs', 'themselves', 'we', 'us', 'our', 'ours', 'ourselves',
    'Me', 'My', 'Mine', 'Myself', 'You', 'Your', 'Yours', 'Yourself', 'Yourselves',
    'He', 'Him', 'His', 'Himself', 'She', 'Her', 'Hers', 'Herself', 'It', 'Its', 'Itself',
    'They', 'Them', 'Their', 'Theirs', 'Themselves', 'We', 'Us', 'Our', 'Ours', 'Ourselves',
    "Lady", "Madam", "Mr.", "Mister", "Sir", "Miss", "Ms.", "Mrs.", "Mr",
]
63
-
64
-
65
# Integer code attached to each privacy category label.
privacy_category_codes = {
    '<PRIVATE>': 1,
    '<NON_PRIVATE>': 2,
    '<OTHER>': 3,
}

# Every ASCII punctuation character except '%' and '$', joined into one string.
punctuation_list = ''.join(
    symbol for symbol in string.punctuation if symbol not in ('%', '$')
)
71
-
72
def get_word_boundaries(sentence):
    """Return the (start, end) character span of every whitespace-delimited word."""
    spans = []
    for word_match in re.finditer(r'[^\s]+', sentence):
        spans.append(word_match.span())
    return spans
75
-
76
-
77
def fuse_ner_labels(flair_ner, presidio_ner, text_type="<PRIVATE>"):
    """Merge the per-word NER labels from 'Flair' and 'Presidio' for a given text.

    Custom cases and predefined rules are also taken into account: pronouns
    and honorifics, "am"/"'m" directly following "I", and numeric tokens.

    As a side effect, any fused label that is not yet a key of
    ``entity_label_to_code_map`` is registered there with the next free code
    (``len(entity_label_to_code_map)``).

    Args:
        flair_ner: Sequence of ``(word, label)`` pairs.
        presidio_ner: Sequence of ``(word, label)`` pairs covering the same
            words, in the same order.
        text_type: Privacy category given to every word whose fused label is
            not ``"<O>"``; must be a key of ``privacy_category_codes``.

    Returns:
        A list with one entry per word:
        ``[word, fused_label, privacy_category, privacy_code, binary_label,
        multi_label]``.

    Raises:
        AssertionError: If the two inputs differ in length or disagree on a word.
    """
    merged_ner = []

    # Sanity check
    assert len(flair_ner) == len(presidio_ner)

    # NOTE(review): n1 is read from `presidio_ner` and n2 from `flair_ner`, yet
    # the f"<{n1}>" branches below only yield keys that exist in
    # entity_label_to_code_map when n1 is an unbracketed tag (e.g. PER) and n2 a
    # bracketed one (e.g. <PERSON>) — confirm the argument order used by callers.
    for i, ((w1, n1), (w2, n2)) in enumerate(zip(presidio_ner, flair_ner)):

        assert w1 == w2

        # Check the word as written as well as lower-cased: honorifics such as
        # "Mr." or "Sir" only appear capitalised in pronoun_list, so a
        # lower-case-only lookup could never match them.
        if w1 in pronoun_list or w1.lower() in pronoun_list:
            common_ner = "<PRONOUNS>"
        elif n1 == "<O>" and n2 == "<O>":
            if w1.lower() in ["am", "'m"] and (i - 1) >= 0 and presidio_ner[i - 1][0].lower() == 'i':
                # "am" / "'m" right after "I" belongs to the pronoun.
                common_ner = "<PRONOUNS>"
            elif re.match(r'(?<!\S)[\$€]?(?:\d{1,3}(?:[ ,.]\d{3})*|\d+)(?:\.\d+)?%?', w1):
                # The token starts with a number, optionally prefixed by a
                # currency sign (re.match only anchors at the start).
                common_ner = "<NUMBER>"
            else:
                common_ner = '<O>'
        elif n1 in n2:
            common_ner = n2
        elif n1 == '<O>' and n2 != '<O>':
            common_ner = n2
        elif n2 == '<O>' and n1 != '<O>':
            common_ner = f"<{n1}>"
        else:
            common_ner = f"<{n1}>-{n2}"

        common_binary_label = 0 if common_ner == "<O>" else 1

        if common_ner not in entity_label_to_code_map:
            # First time this fused label is seen: give it the next free code.
            common_multi_label = len(entity_label_to_code_map)
            print("NOT in KEY", common_ner)
            entity_label_to_code_map[common_ner] = common_multi_label
        else:
            common_multi_label = entity_label_to_code_map[common_ner]

        is_private = text_type if common_ner != '<O>' else '<OTHER>'

        merged_ner.append([w1, common_ner, is_private, privacy_category_codes[is_private], common_binary_label, common_multi_label])

    return merged_ner
130
-
131
# Presidio engines, instantiated once when the module is imported.
# apply_presidio_model passes the output of analyzer.analyze(...) to
# anonymizer.anonymize(...).
analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()
133
-
134
-
135
def apply_presidio_model(sentence, verbose=True):
    """Get Presidio predictions.

    Runs the module-level Presidio analyzer and anonymizer on `sentence`, then
    walks the anonymized text piece by piece (split on '<') and aligns it with
    the original sentence to work out which original words were replaced by a
    placeholder.

    Args:
        sentence: The text to label.
        verbose: When true, print debugging traces of every alignment step.

    Returns:
        A list of (word, label) tuples, where label is '<O>' or f"<{entity}>"
        with `entity` taken from the placeholder found in the anonymized text.

    Raises:
        AssertionError: If the anonymized text cannot be aligned with `sentence`.
    """

    if verbose: print(f"{sentence=}")
    # anonymized_text looks like: ['<PERSON>', 'went', 'to', 'Pitier', 'Hospital', ...]

    anonymized_text = anonymizer.anonymize(text=sentence, analyzer_results=analyzer.analyze(text=sentence, language='en'))
    # Normalize whitespace: split on any whitespace run, re-join with single spaces.
    anonymized_text = anonymized_text.__dict__['text'].split()
    anonymized_text = ' '.join(anonymized_text)
    # Plain-text fragment carried over to be prefixed to the first word of the
    # next hidden (anonymized) part.
    next_word_to_concate = None

    if verbose: print(f"{anonymized_text=}")
    if verbose: print(f"{anonymized_text.split('<')=}")

    # start_index: position in `sentence` up to which words have been labelled.
    start_index, label = 0, []
    previous_label = None

    for i, before_split in enumerate(anonymized_text.split('<')):

        if verbose:
            print(f"\nSubseq_{i}: {before_split=}")

        if i == 0:
            # Text before the first '<': every word is labelled '<O>'.
            assert len(before_split) == len(sentence[start_index: len(before_split)])
            start_index = len(before_split)
            label.extend([(s, '<O>') for s in before_split.split()])
        else:
            # after_split[0] is used as the entity name, after_split[-1] as the
            # plain text that follows the placeholder.
            after_split = before_split.split(">")
            if verbose:
                print(f" -----> ", after_split)
                print(sentence[start_index:])
                print(sentence[start_index:].find(after_split[-1]))

            # Locate the plain text following the placeholder in the original
            # sentence; sentence[start_index:start2_index] is then the part
            # that the placeholder replaced.
            start2_index = start_index + sentence[start_index:].find(after_split[-1])
            end2_index = start2_index + len(after_split[-1])

            if verbose:
                print(f"Sanity check: '[{sentence[start2_index: end2_index]}]' VS '[{after_split[-1]}]'")
                print(f"Hidden part: sentence[{start2_index}: {end2_index}] = {sentence[start2_index: end2_index]}")

            assert sentence[start2_index: end2_index] == after_split[-1]

            # If the following text was found right at start_index (presumably
            # because it is empty — TODO confirm), treat the rest of the
            # sentence as the hidden part.
            start2_index = start2_index if start2_index != start_index else len(sentence)

            # Label every original word of the hidden part with the entity.
            for j, anonimyzed_word in enumerate((sentence[start_index: start2_index]).split()):
                if next_word_to_concate != None and j == 0:
                    label.append((f"{next_word_to_concate}{anonimyzed_word}", f"<{after_split[0]}>"))
                    next_word_to_concate = None
                else:
                    label.append((anonimyzed_word, f"<{after_split[0]}>"))

            previous_label = f"<{after_split[0]}>"

            if len(sentence[start2_index: end2_index]) >= 1 and after_split[-1][-1] != ' ' and i != len(anonymized_text.split('<')) - 1:
                # The plain text does not end with a space and another
                # placeholder follows: keep its last word aside so it is glued
                # to the first word of the next hidden part.
                if verbose: print("Is there a space after?", after_split, after_split[-1][-1], i, len(anonymized_text.split('<')))

                for j, anonimyzed_word in enumerate((after_split[-1]).split()[:-1]):
                    label.append((anonimyzed_word, "<O>"))

                next_word_to_concate = (after_split[-1]).split()[-1]

            elif len(sentence[start2_index: end2_index]) >= 1 and after_split[-1][0] != ' ' and i != len(anonymized_text.split('<')) - 1:
                # The plain text does not start with a space: its first word is
                # appended to the last labelled word, which keeps the entity label.
                if verbose: print("Is there a space before?", after_split, after_split[-1][0], i, len(anonymized_text.split('<')))

                label[-1] = (f"{label[-1][0]}{after_split[-1].split()[0]}", previous_label)

                for j, anonimyzed_word in enumerate((after_split[-1]).split()[1:]):
                    label.append((anonimyzed_word, "<O>"))

            else:
                for j, anonimyzed_word in enumerate((after_split[-1]).split()):
                    label.append((anonimyzed_word, "<O>"))

            start_index = end2_index

    return label
211
-
212
-
213
# Cached Flair tagger, so the model is loaded once instead of on every call.
_FLAIR_TAGGER = None


def _get_flair_tagger():
    """Return the Flair NER tagger, loading it on first use only."""
    global _FLAIR_TAGGER
    if _FLAIR_TAGGER is None:
        _FLAIR_TAGGER = SequenceTagger.load("flair/ner-english-large")
    return _FLAIR_TAGGER


def apply_flair_model(original_sentence):
    """Get Flair predictions.

    Args:
        original_sentence: The text to label.

    Returns:
        A list with one ``(word, label)`` tuple per whitespace-delimited word
        of ``original_sentence``. ``label`` is the value of the Flair span
        label when a span token covers exactly that word, and ``"<O>"``
        otherwise.
    """
    logging.getLogger('flair').setLevel(logging.WARNING)

    flair_sentence = Sentence(original_sentence)
    _get_flair_tagger().predict(flair_sentence)

    # One (form, span label, start, end) entry per token of every predicted span.
    ner = [
        (token.form, span.get_label().value, token.start_position, token.end_position)
        for span in flair_sentence.get_spans("ner")
        for token in span
    ]

    ner_labels, ner_index = [], 0

    for start, end in get_word_boundaries(original_sentence):
        word_from_text = original_sentence[start:end]

        # Skip entity tokens that end at or before this word starts: they can
        # no longer match any word. Without this, a single token that is not
        # aligned with a whitespace word would block the cursor and every
        # later entity would be labelled "<O>".
        while ner_index < len(ner) and ner[ner_index][3] <= start:
            ner_index += 1

        if ner_index < len(ner):
            form, label, s, e = ner[ner_index]
            if (s, e) == (start, end) and word_from_text == form:
                ner_labels.append((word_from_text, label))
                ner_index += 1
                continue

        ner_labels.append((word_from_text, "<O>"))

    return ner_labels
248
-
249
-
250
def preprocess_sentences(sentence, verbose=False):
    """Normalize a sentence through a fixed sequence of regex substitutions.

    When `verbose` is true, the sentence is printed after every step.
    """
    escaped_punctuation = re.escape(punctuation_list)

    # (pattern, replacement) pairs, applied in this order.
    substitutions = (
        # Removing Extra Newlines:
        (r'\n+', ' '),
        # Collapsing Multiple Spaces:
        (' +', ' '),
        # Handling Apostrophes in Possessives:
        (r"'s\b", " s"),
        # Removing Spaces Before Punctuation:
        (r'\s([,.!?;:])', r'\1'),
        # Pattern for Matching Leading or Trailing Punctuation:
        (r'(?<!\w)[{}]|[{}](?!\w)'.format(escaped_punctuation, escaped_punctuation), ''),
    )

    for regex, replacement in substitutions:
        sentence = re.sub(regex, replacement, sentence)
        if verbose:
            print(sentence)

    return sentence