Princess3 committed on
Commit
930e8c0
·
verified ·
1 Parent(s): c13142d

Upload 2 files

Browse files
Files changed (2) hide show
  1. 1.py +194 -0
  2. 2.py +131 -0
1.py ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, xml.etree.ElementTree as ET, torch, torch.nn as nn, torch.nn.functional as F, numpy as np, logging, requests
2
+ from collections import defaultdict
3
+ from torch.utils.data import DataLoader, Dataset, TensorDataset
4
+ from transformers import AutoTokenizer, AutoModel
5
+ from sklearn.metrics.pairwise import cosine_similarity
6
+ from accelerate import Accelerator
7
+ from tqdm import tqdm
8
+
9
# Logging setup: configure the root logger once at import time —
# INFO level and above, formatted as "timestamp - level - message".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
11
+
12
# Configuration class
class Config:
    """Hyper-parameter constants with single-letter names.

    NOTE(review): the meanings are not documented anywhere in this file
    (E/H look like embedding dim / head count — TODO confirm), and Config
    is never referenced by the code below.
    """
    E, H, N, C, B = 512, 32, 1024, 256, 128
    M, S, V = 20000, 2048, 1e5
    W, L, D = 4000, 2e-4, .15
17
+
18
class MyDataset(Dataset):
    """Minimal map-style dataset over parallel data/label sequences."""

    def __init__(self, data, labels):
        # Parallel sequences; index i of one corresponds to index i of the other.
        self.data, self.labels = data, labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        # Yield a (sample, label) pair for the given index.
        return self.data[index], self.labels[index]
29
+
30
class MyModel(nn.Module):
    """Linear projection -> LSTM -> linear head.

    Fixes two defects in the original:
    - ``forward`` referenced a free name ``hidden_size`` that was never in
      scope (NameError on every call); the size is now stored on the instance.
    - the LSTM was declared with ``input_size`` input features but was fed the
      ``hidden_size``-wide output of ``self.hidden``; it now takes
      ``hidden_size`` inputs so the pipeline is dimensionally consistent.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super(MyModel, self).__init__()
        self.hidden_size = hidden_size  # needed by forward() for h0/c0
        self.hidden = nn.Linear(input_size, hidden_size)
        self.output = nn.Linear(hidden_size, output_size)  # unused in forward; kept for interface compatibility
        self.lstm = nn.LSTM(hidden_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        # Assumes x is (batch, seq_len, input_size) — TODO confirm with callers.
        x = torch.relu(self.hidden(x))
        h0 = torch.zeros(1, x.size(0), self.hidden_size, device=x.device)
        c0 = torch.zeros(1, x.size(0), self.hidden_size, device=x.device)
        out, _ = self.lstm(x, (h0, c0))
        # Classify from the last time step only.
        return self.fc(out[:, -1, :])
46
+
47
class MemoryNetwork:
    """Fixed-capacity slot memory with least-used overwrite."""

    def __init__(self, memory_size, embedding_size):
        self.memory_size = memory_size
        self.embedding_size = embedding_size
        self.memory = np.zeros((memory_size, embedding_size))
        self.usage = np.zeros(memory_size)

    def store(self, data):
        # Overwrite the slot with the lowest usage score.
        slot = int(np.argmin(self.usage))
        self.memory[slot] = data
        self.usage[slot] = 1.0

    def retrieve(self, query):
        # Dot-product similarity against every slot; bump the winner's usage.
        scores = self.memory @ query
        slot = int(np.argmax(scores))
        self.usage[slot] += 1.0
        return self.memory[slot]

    def update_usage(self):
        # Exponential decay so stale slots become eviction candidates.
        self.usage *= 0.99
68
+
69
class DM(nn.Module):
    """Dynamic model: named sections, each a pipeline of configured stages."""

    def __init__(self, s):
        super(DM, self).__init__()
        # One ModuleList of stages per section name.
        self.s = nn.ModuleDict()
        for name, specs in s.items():
            self.s[name] = nn.ModuleList([self.cl(spec) for spec in specs])

    def cl(self, lp):
        """Build one Linear(+BatchNorm)(+activation)(+Dropout) stage from a spec dict."""
        stages = [nn.Linear(lp['input_size'], lp['output_size'])]
        if lp.get('batch_norm', True):
            stages.append(nn.BatchNorm1d(lp['output_size']))
        # Dispatch table mirrors the supported activation names; unknown
        # names silently get no activation, as before.
        factories = {
            'relu': lambda: nn.ReLU(inplace=True),
            'tanh': nn.Tanh,
            'sigmoid': nn.Sigmoid,
            'leaky_relu': lambda: nn.LeakyReLU(negative_slope=0.01, inplace=True),
            'elu': lambda: nn.ELU(alpha=1.0, inplace=True),
        }
        act = lp.get('activation', 'relu')
        if act in factories:
            stages.append(factories[act]())
        if dr := lp.get('dropout', 0.0):
            stages.append(nn.Dropout(p=dr))
        return nn.Sequential(*stages)

    def forward(self, x, sn=None):
        # With a section name, run only that section; otherwise run every
        # section in insertion order.
        if sn is not None:
            for stage in self.s[sn]:
                x = stage(x)
        else:
            for stages in self.s.values():
                for stage in stages:
                    x = stage(x)
        return x
96
+
97
def parse_xml(file_path):
    """Extract a layer-spec dict from every <layer> element in an XML file.

    Each spec has int ``input_size``/``output_size`` (defaults 128/256) and a
    lower-cased ``activation`` name (default 'relu').
    """
    root = ET.parse(file_path).getroot()
    return [
        {
            'input_size': int(node.get('input_size', 128)),
            'output_size': int(node.get('output_size', 256)),
            'activation': node.get('activation', 'relu').lower(),
        }
        for node in root.findall('.//layer')
    ]
106
+
107
def create_model_from_folder(folder_path):
    """Walk folder_path, parse every .xml file, and build a DM.

    Layer specs are grouped into sections keyed by the containing directory's
    basename with dots replaced by underscores.
    """
    sections = defaultdict(list)
    for dirpath, _dirnames, filenames in os.walk(folder_path):
        section_name = os.path.basename(dirpath).replace('.', '_')
        for fname in filenames:
            if fname.endswith('.xml'):
                sections[section_name].extend(parse_xml(os.path.join(dirpath, fname)))
    return DM(dict(sections))
118
+
119
def create_embeddings_and_sentences(folder_path, model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Embed the text of every XML element found under folder_path.

    Returns (embeddings, sentences): a (num_texts, hidden_size) array of
    mean-pooled last-hidden-state embeddings and the matching raw strings.

    Fix: a folder containing no XML text no longer crashes ``np.vstack`` on an
    empty list — an empty (0, hidden_size) array is returned instead.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    embeddings, sentences = [], []
    for dirpath, _dirs, files in os.walk(folder_path):
        for fname in files:
            if not fname.endswith('.xml'):
                continue
            root = ET.parse(os.path.join(dirpath, fname)).getroot()
            for element in root.iter():
                if element.text:
                    text = element.text.strip()
                    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
                    with torch.no_grad():
                        # Mean-pool over the token axis to get one vector per text.
                        emb = model(**inputs).last_hidden_state.mean(dim=1).numpy()
                    embeddings.append(emb)
                    sentences.append(text)
    if not embeddings:
        return np.zeros((0, model.config.hidden_size)), sentences
    return np.vstack(embeddings), sentences
140
+
141
def query_vector_similarity(query, embeddings, ds, model_name="sentence-transformers/all-MiniLM-L6-v2", top_k=5):
    """Return the ``top_k`` entries of ``ds`` most cosine-similar to ``query``.

    Generalization: the previously hard-coded result count (5) is now the
    keyword parameter ``top_k`` with the same default, so existing callers
    see identical behavior.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    inputs = tokenizer(query, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        # Mean-pooled last hidden state as the query vector.
        query_emb = model(**inputs).last_hidden_state.mean(dim=1).numpy()
    scores = cosine_similarity(query_emb, embeddings)[0]
    # Highest-scoring indices first.
    top_indices = scores.argsort()[-top_k:][::-1]
    return [ds[i] for i in top_indices]
151
+
152
def fetch_courtlistener_data(query):
    """Search the NZLII case-law index for ``query``.

    NOTE(review): despite the name, this queries nzlii.org, not the
    CourtListener API — confirm which backend is intended.
    Returns a list of simplified result dicts; an empty list on any
    request failure (which is logged).
    """
    endpoint = "https://nzlii.org/cgi-bin/sinosrch.cgi"
    params = {
        "method": "auto",
        "query": query,
        "meta": "/nz",
        "results": "50",
        "format": "json",
    }
    try:
        response = requests.get(endpoint, params=params, headers={"Accept": "application/json"}, timeout=10)
        response.raise_for_status()
        hits = response.json().get("results", [])
        fields = ("title", "citation", "date", "court", "summary", "url")
        return [{name: hit.get(name, "") for name in fields} for hit in hits]
    except requests.exceptions.RequestException as e:
        logging.error(f"Failed to fetch data from NZLII API: {str(e)}")
        return []
164
+
165
# Main function
def main():
    """End-to-end driver: build the dynamic model from XML configs under
    ./data, embed the XML text, train on synthetic data, then run a
    similarity query and a remote case-law search."""
    folder_path = 'data'
    model = create_model_from_folder(folder_path)
    logging.info(f"Created dynamic PyTorch model with sections: {list(model.s.keys())}")
    embeddings, ds = create_embeddings_and_sentences(folder_path)
    accelerator = Accelerator()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss()
    num_epochs = 10
    # NOTE(review): training data is random noise with random labels in [0, 5)
    # — this exercises the loop but cannot learn anything meaningful. It also
    # assumes the XML-configured model accepts 10-feature inputs; confirm.
    dataset = MyDataset(torch.randn(1000, 10), torch.randint(0, 5, (1000,)))
    dataloader = DataLoader(dataset, batch_size=32, shuffle=True)
    model, optimizer, dataloader = accelerator.prepare(model, optimizer, dataloader)
    for epoch in range(num_epochs):
        model.train()
        for batch_data, batch_labels in dataloader:
            optimizer.zero_grad()
            outputs = model(batch_data)
            loss = criterion(outputs, batch_labels)
            accelerator.backward(loss)
            optimizer.step()
        # NOTE(review): indentation reconstructed from a flattened source —
        # assumed per-epoch logging of the final batch's loss; confirm.
        logging.info(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")
    query = "example query text"
    results = query_vector_similarity(query, embeddings, ds)
    logging.info(f"Query results: {results}")
    courtlistener_data = fetch_courtlistener_data(query)
    logging.info(f"CourtListener API results: {courtlistener_data}")

if __name__ == "__main__":
    main()
2.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, xml.etree.ElementTree as ET, torch, torch.nn as nn, numpy as np, logging, requests
2
+ from collections import defaultdict
3
+ from torch.utils.data import DataLoader, Dataset
4
+ from transformers import AutoTokenizer, AutoModel
5
+ from sklearn.metrics.pairwise import cosine_similarity
6
+ from accelerate import Accelerator
7
+ from tqdm import tqdm
8
+
9
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
10
+
11
class Config:
    # Hyper-parameter constants with single-letter names; their meanings are
    # not documented in this file (E/H look like embedding dim / head count —
    # TODO confirm). NOTE(review): Config is never referenced below.
    E, H, N, C, B, M, S, V, W, L, D = 512, 32, 1024, 256, 128, 20000, 2048, 1e5, 4000, 2e-4, .15
12
+
13
class MyDataset(Dataset):
    """Map-style dataset pairing each sample with its label."""

    def __init__(self, data, labels):
        self.data = data
        self.labels = labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return self.data[index], self.labels[index]
17
+
18
class MyModel(nn.Module):
    """Feature projection followed by an LSTM and a linear head.

    Fixes:
    - ``forward`` referenced an undefined free name ``hidden_size``
      (NameError on every call); the size is now stored on the instance.
    - the LSTM was declared with ``input_size`` input features but fed the
      ``hidden_size``-wide activations from ``self.hidden``; it now takes
      ``hidden_size`` inputs.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super(MyModel, self).__init__()
        self.hidden_size = hidden_size  # needed by forward() for h0/c0
        self.hidden = nn.Linear(input_size, hidden_size)
        self.output = nn.Linear(hidden_size, output_size)  # unused in forward; kept for compatibility
        self.lstm = nn.LSTM(hidden_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        # Assumes x is (batch, seq_len, input_size) — TODO confirm with callers.
        x = torch.relu(self.hidden(x))
        h0 = torch.zeros(1, x.size(0), self.hidden_size, device=x.device)
        c0 = torch.zeros(1, x.size(0), self.hidden_size, device=x.device)
        out, _ = self.lstm(x, (h0, c0))
        return self.fc(out[:, -1, :])
28
+
29
class MemoryNetwork:
    """Fixed-capacity slot memory with least-used overwrite.

    Fix: the constructor arguments are now kept as ``memory_size`` /
    ``embedding_size`` attributes instead of being discarded (consistent
    with introspection needs and the sibling implementation).
    """

    def __init__(self, memory_size, embedding_size):
        self.memory_size = memory_size
        self.embedding_size = embedding_size
        self.memory = np.zeros((memory_size, embedding_size))
        self.usage = np.zeros(memory_size)

    def store(self, data):
        # Overwrite the slot with the lowest usage score.
        index = np.argmin(self.usage)
        self.memory[index] = data
        self.usage[index] = 1.0

    def retrieve(self, query):
        # Dot-product similarity; bumps the winning slot's usage.
        index = np.argmax(np.dot(self.memory, query))
        self.usage[index] += 1.0
        return self.memory[index]

    def update_usage(self):
        # Decay so unused slots age out and become overwrite candidates.
        self.usage *= 0.99
40
+
41
class DM(nn.Module):
    """Dynamic model: a ModuleDict of named sections, each a list of
    Linear(+BatchNorm)(+activation)(+Dropout) stages built from spec dicts.

    Fix: ``forward`` tested ``if sn:`` — an empty-string section name was
    silently treated as "run every section". It now tests ``sn is not None``
    (matching the explicit ``None`` default), so a bad name raises KeyError
    instead of being ignored.
    """

    def __init__(self, s):
        super(DM, self).__init__()
        self.s = nn.ModuleDict({sn: nn.ModuleList([self.cl(lp) for lp in l]) for sn, l in s.items()})

    def cl(self, lp):
        """Build one stage from a spec dict (input_size/output_size required;
        batch_norm defaults to True, activation to 'relu', dropout to 0.0)."""
        stages = [nn.Linear(lp['input_size'], lp['output_size'])]
        if lp.get('batch_norm', True):
            stages.append(nn.BatchNorm1d(lp['output_size']))
        act = lp.get('activation', 'relu')
        if act == 'relu':
            stages.append(nn.ReLU(inplace=True))
        elif act == 'tanh':
            stages.append(nn.Tanh())
        elif act == 'sigmoid':
            stages.append(nn.Sigmoid())
        elif act == 'leaky_relu':
            stages.append(nn.LeakyReLU(negative_slope=0.01, inplace=True))
        elif act == 'elu':
            stages.append(nn.ELU(alpha=1.0, inplace=True))
        if dr := lp.get('dropout', 0.0):
            stages.append(nn.Dropout(p=dr))
        return nn.Sequential(*stages)

    def forward(self, x, sn=None):
        if sn is not None:
            # Run only the named section.
            for stage in self.s[sn]:
                x = stage(x)
        else:
            # Run every section in insertion order.
            for stages in self.s.values():
                for stage in stages:
                    x = stage(x)
        return x
63
+
64
def parse_xml(file_path):
    """Extract a layer-spec dict from every <layer> element in an XML file.

    Each spec has int ``input_size``/``output_size`` (defaults 128/256) and a
    lower-cased ``activation`` name (default 'relu').

    Fix: the original parsed the same file twice (once for the tree, once for
    the root); it is now parsed a single time.
    """
    root = ET.parse(file_path).getroot()
    layers = []
    for node in root.findall('.//layer'):
        layers.append({
            'input_size': int(node.get('input_size', 128)),
            'output_size': int(node.get('output_size', 256)),
            'activation': node.get('activation', 'relu').lower(),
        })
    return layers
70
+
71
def create_model_from_folder(folder_path):
    """Recursively collect layer specs from every .xml file under
    folder_path, grouped into sections named after the containing directory
    (dots replaced by underscores), and build a DM from them."""
    sections = defaultdict(list)
    for dirpath, _dirnames, filenames in os.walk(folder_path):
        section_name = os.path.basename(dirpath).replace('.', '_')
        for fname in filenames:
            if fname.endswith('.xml'):
                sections[section_name].extend(parse_xml(os.path.join(dirpath, fname)))
    return DM(dict(sections))
78
+
79
def create_embeddings_and_sentences(folder_path, model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Mean-pool embeddings for the text of every XML element under folder_path.

    Returns (embeddings, sentences): a (num_texts, hidden_size) array and the
    matching raw strings.

    Fixes:
    - each XML file was parsed twice (once for the tree, once for the root);
      it is now parsed a single time.
    - a folder with no XML text no longer crashes ``np.vstack`` on an empty
      list — an empty (0, hidden_size) array is returned instead.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    embeddings, sentences = [], []
    for dirpath, _dirs, files in os.walk(folder_path):
        for fname in files:
            if not fname.endswith('.xml'):
                continue
            root = ET.parse(os.path.join(dirpath, fname)).getroot()
            for element in root.iter():
                if element.text:
                    text = element.text.strip()
                    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
                    with torch.no_grad():
                        # One mean-pooled vector per text.
                        embeddings.append(model(**inputs).last_hidden_state.mean(dim=1).numpy())
                    sentences.append(text)
    if not embeddings:
        return np.zeros((0, model.config.hidden_size)), sentences
    return np.vstack(embeddings), sentences
93
+
94
def query_vector_similarity(query, embeddings, ds, model_name="sentence-transformers/all-MiniLM-L6-v2", top_k=5):
    """Return the ``top_k`` entries of ``ds`` most cosine-similar to ``query``.

    Generalization: the previously hard-coded result count (5) is now the
    keyword parameter ``top_k`` with the same default, so existing callers
    see identical behavior.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    inputs = tokenizer(query, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        # Mean-pooled last hidden state as the query vector.
        query_emb = model(**inputs).last_hidden_state.mean(dim=1).numpy()
    ranked = cosine_similarity(query_emb, embeddings)[0].argsort()[-top_k:][::-1]
    return [ds[i] for i in ranked]
100
+
101
def fetch_courtlistener_data(query):
    """Search the NZLII case-law index for ``query``.

    NOTE(review): despite the name, this calls nzlii.org, not the
    CourtListener API — confirm which backend is intended.
    Returns simplified result dicts; an empty list on any request failure
    (which is logged).
    """
    params = {
        "method": "auto",
        "query": query,
        "meta": "/nz",
        "results": "50",
        "format": "json",
    }
    try:
        response = requests.get(
            "https://nzlii.org/cgi-bin/sinosrch.cgi",
            params=params,
            headers={"Accept": "application/json"},
            timeout=10,
        )
        response.raise_for_status()
        fields = ("title", "citation", "date", "court", "summary", "url")
        return [{name: hit.get(name, "") for name in fields} for hit in response.json().get("results", [])]
    except requests.exceptions.RequestException as e:
        logging.error(f"Failed to fetch data from NZLII API: {str(e)}")
        return []
109
+
110
def main():
    """End-to-end driver: build the dynamic model from XML configs under
    ./data, embed the XML text, train on synthetic data, then run a
    similarity query and a remote case-law search.

    Fixes:
    - the synthetic dataset was constructed twice with independent random
      tensors (the first binding was discarded and the DataLoader wrapped an
      unrelated second copy); it is now built once.
    - the ``'data'`` literal was duplicated instead of reusing folder_path.
    """
    folder_path = 'data'
    model = create_model_from_folder(folder_path)
    logging.info(f"Created dynamic PyTorch model with sections: {list(model.s.keys())}")
    embeddings, ds = create_embeddings_and_sentences(folder_path)
    accelerator = Accelerator()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss()
    num_epochs = 10
    # NOTE(review): random noise with random labels — exercises the loop but
    # cannot learn; assumes the XML-configured model accepts 10-feature input.
    dataset = MyDataset(torch.randn(1000, 10), torch.randint(0, 5, (1000,)))
    dataloader = DataLoader(dataset, batch_size=32, shuffle=True)
    model, optimizer, dataloader = accelerator.prepare(model, optimizer, dataloader)
    for epoch in range(num_epochs):
        model.train()
        for batch_data, batch_labels in dataloader:
            optimizer.zero_grad()
            outputs = model(batch_data)
            loss = criterion(outputs, batch_labels)
            accelerator.backward(loss)
            optimizer.step()
        # Logs the last batch's loss as the epoch summary.
        logging.info(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")
    query = "example query text"
    logging.info(f"Query results: {query_vector_similarity(query, embeddings, ds)}")
    logging.info(f"CourtListener API results: {fetch_courtlistener_data(query)}")

if __name__ == "__main__":
    main()