Update 2.py
2.py
CHANGED
@@ -8,20 +8,6 @@ from tqdm import tqdm
 
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 
-
-# Set the cache directory path
-cache_dir = '/app/cache'
-
-# Create the directory if it doesn't exist
-if not os.path.exists(cache_dir):
-    os.makedirs(cache_dir)
-
-# Set the environment variable
-os.environ['HF_HOME'] = cache_dir
-
-# Verify the environment variable is set
-print(f"HF_HOME is set to: {os.environ['HF_HOME']}")
-
 class Config: E, H, N, C, B, M, S, V, W, L, D = 512, 32, 1024, 256, 128, 20000, 2048, 1e5, 4000, 2e-4, .15
 
 class MyDataset(Dataset):
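This hunk drops the in-script Hugging Face cache setup. If the cache should still live at /app/cache, one option is to set HF_HOME before transformers is imported; a minimal sketch under that assumption, not part of 2.py:

import os
os.environ.setdefault('HF_HOME', '/app/cache')   # assumption: same cache path as the removed block
os.makedirs(os.environ['HF_HOME'], exist_ok=True)
# import transformers only after HF_HOME is set so the cache location is picked up
from transformers import AutoModel, AutoTokenizer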
@@ -90,7 +76,7 @@ def create_model_from_folder(folder_path):
                s[os.path.basename(r).replace('.', '_')].extend(parse_xml(os.path.join(r, file)))
     return DM(dict(s))
 
-def create_embeddings_and_sentences(folder_path, model_name="pile-of-law/legalbert-large-1.7M-1"):
+def create_embeddings_and_sentences(folder_path, model_name="pile-of-law/legalbert-large-1.7M-1", max_length=512):
     t, m, embeddings, ds = AutoTokenizer.from_pretrained(model_name), AutoModel.from_pretrained(model_name), [], []
     for r, d, f in os.walk(folder_path):
         for file in f:
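The new max_length argument makes the truncation limit explicit rather than relying on the tokenizer's default model_max_length. A quick self-contained check of the behaviour, assuming the same LegalBERT checkpoint (the sample text is made up):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("pile-of-law/legalbert-large-1.7M-1")
enc = tok("lorem ipsum " * 2000, return_tensors="pt",
          truncation=True, padding=True, max_length=512)
print(enc["input_ids"].shape[1])  # stays at or below 512 once max_length is passed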
@@ -99,15 +85,15 @@ def create_embeddings_and_sentences(folder_path, model_name="pile-of-law/legalbe
                for e in root.iter():
                     if e.text:
                         text = e.text.strip()
-                        i = t(text, return_tensors="pt", truncation=True, padding=True)
+                        i = t(text, return_tensors="pt", truncation=True, padding=True, max_length=max_length)
                         with torch.no_grad():
                             embeddings.append(m(**i).last_hidden_state.mean(dim=1).numpy())
                         ds.append(text)
     return np.vstack(embeddings), ds
 
-def query_vector_similarity(query, embeddings, ds, model_name="pile-of-law/legalbert-large-1.7M-2"):
+def query_vector_similarity(query, embeddings, ds, model_name="pile-of-law/legalbert-large-1.7M-2", max_length=512):
     t, m = AutoTokenizer.from_pretrained(model_name), AutoModel.from_pretrained(model_name)
-    i = t(query, return_tensors="pt", truncation=True, padding=True)
+    i = t(query, return_tensors="pt", truncation=True, padding=True, max_length=max_length)
     with torch.no_grad():
         qe = m(**i).last_hidden_state.mean(dim=1).numpy()
     return [ds[i] for i in cosine_similarity(qe, embeddings)[0].argsort()[-5:][::-1]]
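The two functions are meant to be used together: build the corpus embeddings once, then rank sentences by cosine similarity against the query embedding. A usage sketch with a hypothetical query string (the 'data' folder is the one main() passes):

embeddings, ds = create_embeddings_and_sentences('data')
for hit in query_vector_similarity('statute of limitations', embeddings, ds):
    print(hit)   # the five most similar sentences, best match first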
@@ -122,7 +108,7 @@ def fetch_courtlistener_data(query):
        return []
 
 def main():
-    folder_path, model = 'data', create_model_from_folder('
+    folder_path, model = 'data', create_model_from_folder('Xml_Data')
     logging.info(f"Created dynamic PyTorch model with sections: {list(model.s.keys())}")
     embeddings, ds = create_embeddings_and_sentences(folder_path)
     accelerator, optimizer, criterion, num_epochs = Accelerator(), torch.optim.Adam(model.parameters(), lr=0.001), nn.CrossEntropyLoss(), 10
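The hunk ends right after the Accelerator, Adam optimizer, loss and epoch count are created. A hedged sketch of how such objects are typically wired together with Accelerate; the train_loader and batch/label names are assumptions, not taken from 2.py:

# assumption: a DataLoader named train_loader yielding (batch, labels) pairs
model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader)
for epoch in range(num_epochs):
    for batch, labels in train_loader:
        optimizer.zero_grad()
        loss = criterion(model(batch), labels)
        accelerator.backward(loss)   # Accelerate's replacement for loss.backward()
        optimizer.step()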