Princess3 committed on
Commit
b65e3b1
1 Parent(s): 898a19f

Update 2.py

Browse files
Files changed (1) hide show
  1. 2.py +5 -19
2.py CHANGED
@@ -8,20 +8,6 @@ from tqdm import tqdm
8
 
9
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
10
 
11
-
12
# Set up the Hugging Face cache directory before any model downloads happen.
cache_dir = '/app/cache'

# exist_ok=True creates the directory idempotently, avoiding the
# check-then-create race of `if not os.path.exists(...): os.makedirs(...)`.
os.makedirs(cache_dir, exist_ok=True)

# Point Hugging Face libraries at the cache via HF_HOME.
# NOTE(review): presumably intended to run before transformers loads any
# model — confirm this executes ahead of from_pretrained calls.
os.environ['HF_HOME'] = cache_dir

# Verify the environment variable is set
print(f"HF_HOME is set to: {os.environ['HF_HOME']}")
24
-
25
class Config:
    """Hyper-parameter constants for the model/training script.

    Single-letter names are kept exactly as-is because call sites
    reference them directly (e.g. Config.E). Meanings of individual
    letters are not documented in the source — TODO confirm with author.
    """
    E = 512      # value used elsewhere as a size — semantics unconfirmed
    H = 32
    N = 1024
    C = 256
    B = 128
    M = 20000
    S = 2048
    V = 1e5
    W = 4000
    L = 2e-4     # looks like a learning rate — TODO confirm
    D = .15      # looks like a dropout/ratio — TODO confirm
26
 
27
  class MyDataset(Dataset):
@@ -90,7 +76,7 @@ def create_model_from_folder(folder_path):
90
  s[os.path.basename(r).replace('.', '_')].extend(parse_xml(os.path.join(r, file)))
91
  return DM(dict(s))
92
 
93
- def create_embeddings_and_sentences(folder_path, model_name="pile-of-law/legalbert-large-1.7M-1"):
94
  t, m, embeddings, ds = AutoTokenizer.from_pretrained(model_name), AutoModel.from_pretrained(model_name), [], []
95
  for r, d, f in os.walk(folder_path):
96
  for file in f:
@@ -99,15 +85,15 @@ def create_embeddings_and_sentences(folder_path, model_name="pile-of-law/legalbe
99
  for e in root.iter():
100
  if e.text:
101
  text = e.text.strip()
102
- i = t(text, return_tensors="pt", truncation=True, padding=True)
103
  with torch.no_grad():
104
  embeddings.append(m(**i).last_hidden_state.mean(dim=1).numpy())
105
  ds.append(text)
106
  return np.vstack(embeddings), ds
107
 
108
def query_vector_similarity(query, embeddings, ds, model_name="pile-of-law/legalbert-large-1.7M-2", max_length=512):
    """Return the 5 corpus sentences most similar to *query*.

    Embeds the query with the given Hugging Face model (mean-pooled last
    hidden state) and ranks the precomputed sentence *embeddings* by
    cosine similarity.

    Args:
        query: Query text.
        embeddings: 2-D array of sentence embeddings; rows align with *ds*.
        ds: Sentences corresponding to the rows of *embeddings*.
        model_name: HF model id used to embed the query.
        max_length: Explicit token cap for the tokenizer. Previously the
            call relied on the model's implicit limit; the cap makes
            truncation bounded and predictable (backward-compatible default).

    Returns:
        List of up to 5 sentences from *ds*, most similar first.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    inputs = tokenizer(query, return_tensors="pt", truncation=True,
                       padding=True, max_length=max_length)
    with torch.no_grad():
        # Mean-pool token embeddings into a single query vector.
        query_emb = model(**inputs).last_hidden_state.mean(dim=1).numpy()
    sims = cosine_similarity(query_emb, embeddings)[0]
    # argsort ascending -> take last 5 and reverse for highest-first.
    # (renamed loop var: original reused `i`, shadowing the tokenizer output)
    top_idx = sims.argsort()[-5:][::-1]
    return [ds[j] for j in top_idx]
@@ -122,7 +108,7 @@ def fetch_courtlistener_data(query):
122
  return []
123
 
124
  def main():
125
- folder_path, model = 'data', create_model_from_folder('data')
126
  logging.info(f"Created dynamic PyTorch model with sections: {list(model.s.keys())}")
127
  embeddings, ds = create_embeddings_and_sentences(folder_path)
128
  accelerator, optimizer, criterion, num_epochs = Accelerator(), torch.optim.Adam(model.parameters(), lr=0.001), nn.CrossEntropyLoss(), 10
 
8
 
9
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
class Config:
    """Hyper-parameter constants for the model/training script.

    Single-letter names are preserved verbatim because callers access
    them as attributes (e.g. Config.B). The source does not document
    what each letter stands for — TODO confirm with author.
    """
    E, H = 512, 32
    N, C = 1024, 256
    B, M = 128, 20000
    S, V = 2048, 1e5
    W = 4000
    L = 2e-4     # plausibly a learning rate — unconfirmed
    D = .15      # plausibly a dropout probability — unconfirmed
12
 
13
  class MyDataset(Dataset):
 
76
  s[os.path.basename(r).replace('.', '_')].extend(parse_xml(os.path.join(r, file)))
77
  return DM(dict(s))
78
 
79
+ def create_embeddings_and_sentences(folder_path, model_name="pile-of-law/legalbert-large-1.7M-1", max_length=512):
80
  t, m, embeddings, ds = AutoTokenizer.from_pretrained(model_name), AutoModel.from_pretrained(model_name), [], []
81
  for r, d, f in os.walk(folder_path):
82
  for file in f:
 
85
  for e in root.iter():
86
  if e.text:
87
  text = e.text.strip()
88
+ i = t(text, return_tensors="pt", truncation=True, padding=True, max_length=max_length)
89
  with torch.no_grad():
90
  embeddings.append(m(**i).last_hidden_state.mean(dim=1).numpy())
91
  ds.append(text)
92
  return np.vstack(embeddings), ds
93
 
94
def query_vector_similarity(query, embeddings, ds, model_name="pile-of-law/legalbert-large-1.7M-2", max_length=512, top_k=5):
    """Return the *top_k* corpus sentences most similar to *query*.

    Embeds the query with the given Hugging Face model (mean-pooled last
    hidden state) and ranks the precomputed sentence *embeddings* by
    cosine similarity.

    Args:
        query: Query text.
        embeddings: 2-D array of sentence embeddings; rows align with *ds*.
        ds: Sentences corresponding to the rows of *embeddings*.
        model_name: HF model id used to embed the query.
        max_length: Token cap passed to the tokenizer.
        top_k: Number of results to return (was a hard-coded 5; now a
            backward-compatible parameter).

    Returns:
        List of up to *top_k* sentences from *ds*, most similar first.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    inputs = tokenizer(query, return_tensors="pt", truncation=True,
                       padding=True, max_length=max_length)
    with torch.no_grad():
        # Mean-pool token embeddings into a single query vector.
        query_emb = model(**inputs).last_hidden_state.mean(dim=1).numpy()
    sims = cosine_similarity(query_emb, embeddings)[0]
    # argsort ascending -> take last top_k and reverse for highest-first.
    # (renamed loop var: original reused `i`, shadowing the tokenizer output)
    top_idx = sims.argsort()[-top_k:][::-1]
    return [ds[j] for j in top_idx]
 
108
  return []
109
 
110
  def main():
111
+ folder_path, model = 'data', create_model_from_folder('Xml_Data')
112
  logging.info(f"Created dynamic PyTorch model with sections: {list(model.s.keys())}")
113
  embeddings, ds = create_embeddings_and_sentences(folder_path)
114
  accelerator, optimizer, criterion, num_epochs = Accelerator(), torch.optim.Adam(model.parameters(), lr=0.001), nn.CrossEntropyLoss(), 10