Princess3 committed on
Commit
a67dc80
1 Parent(s): 00f7a31

Upload 3 files

Browse files
Files changed (3) hide show
  1. aii.py +297 -0
  2. model.py +127 -0
  3. princesse.dockerfile +35 -0
aii.py ADDED
@@ -0,0 +1,297 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os,time,re,glob,pickle,itertools,json,hashlib,asyncio,threading,concurrent.futures,warnings,logging
2
+ from pathlib import Path;from collections import defaultdict,Counter;from concurrent.futures import *
3
+ from typing import *;import numpy as np,torch,torch.nn as nn,torch.nn.functional as F,torch.optim as optim
4
+ from torch.utils.data import *;from torch_geometric.nn import *;from torch.distributions import Categorical
5
+ import nltk,networkx as nx,scipy.sparse as sp;from tqdm import tqdm;from queue import Queue
6
+ from nltk.tokenize import *;from nltk.stem import *;from nltk.corpus import *;from nltk.tag import *
7
+ from nltk.chunk import *;warnings.filterwarnings('ignore');P=print
8
+
9
+ print("Starting module initialization...")
10
+
11
class Cfg:
    """Shared hyper-parameter table for this module.

    How the constants are used elsewhere in this file:
      E - embedding width (default ``ed`` of ``MA``)
      H - attention head count (default ``h`` of ``MA``)
      N - length of the learned memory parameter in ``MA``
      C - default number of output classes of ``MA``
      B - default batch size of ``D``
      S - length of the positional-embedding table in ``MA``
      V - vocabulary size handed to ``nn.Embedding`` (must be an int)
      W - total step count given to the LR scheduler in ``Opt``
      L - learning rate default of ``Opt``
      D - dropout probability
      M - not referenced by the code visible in this file
    """

    E, H, N, C, B = 512, 32, 1024, 256, 128
    # V was written as 1e5 (a float); nn.Embedding requires an integer size.
    M, S, V = 20000, 2048, 100_000
    W, L, D = 4000, 2e-4, .15

    @classmethod
    def d(cls):
        """Return the public configuration constants as a plain dict.

        Methods and descriptors (including this classmethod) are excluded so
        the result only holds values and can be pickled or JSON-encoded.
        """
        cfg = {
            k: v
            for k, v in vars(cls).items()
            if not k.startswith('_')
            and not isinstance(v, (classmethod, staticmethod))
            and not callable(v)
        }
        print(f"Retrieving configuration dictionary with {len(cfg)} items")
        return cfg
17
+
18
class Log:
    """Thin wrapper around the shared ``'R'`` logger that also echoes to stdout.

    Every instance uses the same ``logging.getLogger('R')`` object, so
    handlers are attached only if an equivalent one is not already present.
    Without that check each new ``Log()`` duplicated every record.
    """

    def __init__(s, f='r.log'):
        """Attach a file handler for *f* and one console handler if missing."""
        print(f"Initializing logger with file: {f}")
        s.l = logging.getLogger('R')
        s.l.setLevel(logging.INFO)
        fmt = logging.Formatter('%(asctime)s-%(name)s-%(levelname)s-%(message)s')
        target = os.path.abspath(os.fspath(f))
        has_file = any(
            isinstance(h, logging.FileHandler) and h.baseFilename == target
            for h in s.l.handlers
        )
        # FileHandler subclasses StreamHandler, so compare the exact type.
        has_stream = any(type(h) is logging.StreamHandler for h in s.l.handlers)
        pending = []
        if not has_file:
            pending.append(logging.FileHandler(f))
        if not has_stream:
            pending.append(logging.StreamHandler())
        for h in pending:
            h.setFormatter(fmt)
            s.l.addHandler(h)

    def i(s, m):
        """Print and log *m* at INFO level."""
        print(f"INFO: {m}")
        s.l.info(m)

    def e(s, m):
        """Print and log *m* at ERROR level."""
        print(f"ERROR: {m}")
        s.l.error(m)

    def s(s, m):
        """Print *m* as a success message and log it at INFO level in green."""
        print(f"SUCCESS: {m}")
        s.l.info(f"\033[92m{m}\033[0m")
34
+
35
class Res:
    """Holds the NLTK components used to turn raw text into linguistic features.

    Attributes set in ``__init__``:
      l - a ``Log`` instance
      c - an empty dict (not used by the code visible in this file)
      d - component table: 'l' lemmatizer, 's' stemmer, 't' tokenizer,
          'p' part-of-speech tagger, 'r' chunk parser built from ``G``
    """

    R = {'t': {'p', 'a'}, 'g': {'u', 'a'}, 'c': {'s', 'w', 'b', 't'}}

    # Chunk grammar for RegexpParser. Raw strings keep the literal "\$" without
    # triggering an invalid-escape warning.
    G = (
        "\n"
        r"NP: {<DT|PP\$>?<JJ>*<NN.*>+}" "\n"
        r"VP: {<VB.*><NP|PP|CLAUSE>+}" "\n"
        r"CLAUSE: {<NP><VP>}" "\n"
    )

    def __init__(s):
        print("Initializing Resource Manager...")
        s.l = Log()
        s.c = {}
        s._i()
        # `P` is rebound to the pipeline class later in the file, so print
        # directly instead of going through that alias.
        print('Resource manager initialized')

    def _i(s):
        """Build the component table (the class previously defined this twice)."""
        print("Initializing NLP components...")
        s.d = {
            'l': WordNetLemmatizer(),
            's': PorterStemmer(),
            't': ToktokTokenizer(),
            'p': s._p(),
            'r': RegexpParser(s.G.strip()),
        }
        print('Components initialized with enhanced grammar')

    def _p(s):
        """Train a trigram -> bigram -> unigram -> default tagger on Brown.

        Training keeps sentence boundaries. The previous version flattened all
        10000 sentences into a single one, so n-gram contexts leaked across
        sentences. A ``DefaultTagger('NN')`` ends the backoff chain so unseen
        words get a tag rather than ``None``.
        """
        print("Processing tagged sentences...")
        raw_sents = nltk.corpus.brown.tagged_sents()[:10000]
        cleaned = ([(w, t) for w, t in sent if w and t] for sent in raw_sents)
        train = [sent for sent in cleaned if sent]
        unigram = UnigramTagger(train, backoff=DefaultTagger('NN'))
        bigram = BigramTagger(train, backoff=unigram)
        return TrigramTagger(train=train, backoff=bigram)

    def p(s, t):
        """Tokenize *t* and return its features.

        Returns a dict with keys 't' (tokens), 'p' (tagged tokens),
        'l' (lemmas), 's' (stems) and 'r' (chunk parse of the tagged tokens).
        """
        print(f"Processing text input of length: {len(t)}")
        k = s.d['t'].tokenize(t)
        f = {
            't': k,
            'p': s.d['p'].tag(k),
            'l': [s.d['l'].lemmatize(x) for x in k],
            's': [s.d['s'].stem(x) for x in k],
        }
        # The parse was submitted to a thread pool and awaited immediately,
        # which is just a synchronous call with extra overhead.
        f['r'] = s.d['r'].parse(f['p'])
        print(f'Processed text: {len(k)} tokens')
        return f
82
class TB(nn.Module):
    """Pre-norm transformer block with an optional graph-attention branch.

    Sub-modules (attribute names are kept for state-dict compatibility):
      a - multi-head attention
      m - feed-forward MLP with expansion ratio ``r``
      n - three LayerNorms (attention, MLP, graph branch)
      g - GATConv applied when edges are supplied
      f - sigmoid gate applied to the block output
    """

    def __init__(s, d, h, r=4, p=Cfg.D):
        """
        Args:
            d: feature width.
            h: number of attention heads.
            r: hidden expansion ratio of the MLP.
            p: dropout probability.
        """
        super().__init__()
        # batch_first=True: inputs are handled as (batch, seq, dim) below.
        s.a = nn.MultiheadAttention(d, h, dropout=p, batch_first=True)
        s.m = nn.Sequential(
            nn.Linear(d, int(d * r)), nn.GELU(), nn.Dropout(p),
            nn.Linear(int(d * r), d), nn.Dropout(p),
        )
        s.n = nn.ModuleList([nn.LayerNorm(d) for _ in range(3)])
        # The 3rd/4th positional parameters of GATConv are heads and concat,
        # so the original call bound p to concat. concat=False averages the
        # heads and keeps the output width at d, which the reshape in forward
        # relies on.
        s.g = GATConv(d, d, heads=4, concat=False, dropout=p)
        s.f = nn.Sequential(nn.Linear(d, d), nn.Sigmoid())
        # `P` is rebound to the pipeline class later in the file; print directly.
        print(f'Transformer block initialized: dim={d}, heads={h}')

    def forward(s, x, e=None, m=None):
        """Apply attention, MLP, optional graph branch and output gate.

        Args:
            x: input of shape (batch, seq, dim).
            e: optional edge index for the GATConv branch.
            m: optional memory tensor used as attention key and value; it is
               expanded along the batch axis. When omitted the block
               self-attends.
               NOTE(review): treating ``m`` as key/value memory is inferred
               from how ``MA`` passes its memory parameter - confirm.
        """
        q = s.n[0](x)
        kv = q if m is None else m.expand(x.size(0), -1, -1)
        # MultiheadAttention.forward needs query, key and value; the original
        # passed only two tensors.
        x = x + s.a(q, kv, kv, need_weights=False)[0]
        x = x + s.m(s.n[1](x))
        if e is not None:
            x = x + s.g(s.n[2](x).view(-1, x.size(-1)), e).view(x.size())
        return x * s.f(x)
94
+
95
class MA(nn.Module):
    """Token classifier: embedding + stacked ``TB`` blocks + mean-pool head.

    Attributes (names kept for state-dict compatibility):
      e - token embedding
      p - learned positional table of shape (1, Cfg.S, ed)
      t - list of ``d`` transformer blocks
      m - learned memory of shape (1, Cfg.N, ed), passed to every block
      o - tanh projection applied to the pooled representation
      c - classification head producing ``nc`` logits
    """

    def __init__(s, nc=Cfg.C, vs=Cfg.V, ed=Cfg.E, d=12, h=Cfg.H):
        """
        Args:
            nc: number of output classes.
            vs: vocabulary size (coerced to int; Cfg.V may be a float).
            ed: embedding width.
            d: number of transformer blocks.
            h: attention heads per block.
        """
        super().__init__()
        # nn.Embedding rejects a float num_embeddings, and Cfg.V is 1e5.
        s.e = nn.Embedding(int(vs), ed)
        s.p = nn.Parameter(torch.zeros(1, Cfg.S, ed))
        s.t = nn.ModuleList([TB(ed, h) for _ in range(d)])
        s.m = nn.Parameter(torch.zeros(1, Cfg.N, ed))
        s.o = nn.Sequential(nn.Linear(ed, ed), nn.Tanh())
        s.c = nn.Sequential(
            nn.Linear(ed, ed // 2), nn.ReLU(), nn.Dropout(Cfg.D),
            nn.Linear(ed // 2, nc),
        )
        s._i()
        # `P` is rebound to the pipeline class later in the file; print directly.
        print(f'Model architecture initialized: classes={nc}, vocab={vs}, dim={ed}')

    def _i(s):
        """Xavier-initialise Linear weights and zero every ``bias`` parameter."""
        def init(layer):
            if isinstance(layer, nn.Linear):
                nn.init.xavier_uniform_(layer.weight)
            if getattr(layer, 'bias', None) is not None:
                nn.init.zeros_(layer.bias)
        s.apply(init)

    def forward(s, x, e=None):
        """Return class logits of shape (batch, nc).

        Args:
            x: integer token ids of shape (batch, seq); seq must not exceed
               the positional table length (Cfg.S).
            e: optional edge index forwarded to each block.
        """
        _, seq_len = x.shape
        x = s.e(x) + s.p[:, :seq_len]
        for block in s.t:
            x = block(x, e, s.m)
        return s.c(s.o(x.mean(1)))
114
+
115
class Opt:
    """AdamW + OneCycleLR + AMP gradient scaling behind a single ``step``.

    Attributes:
      o - the AdamW optimizer
      s - the OneCycleLR scheduler
      c - the AMP GradScaler
      g - max gradient norm used for clipping
      w - total number of scheduler steps
    """

    def __init__(s, p, l=Cfg.L, w=Cfg.W, d=.01):
        """
        Args:
            p: iterable of parameters to optimise.
            l: peak learning rate.
            w: total number of scheduler steps.
            d: weight decay.
        """
        s.o = optim.AdamW(p, lr=l, betas=(.9, .999), eps=1e-8, weight_decay=d)
        # Keywords are required: after total_steps the positional parameters
        # of OneCycleLR are epochs, steps_per_epoch, pct_start, anneal_strategy,
        # so the original call passed 25 as anneal_strategy and raised.
        s.s = optim.lr_scheduler.OneCycleLR(
            s.o, max_lr=l, total_steps=w, pct_start=.1, anneal_strategy='cos',
            cycle_momentum=True, div_factor=25, final_div_factor=1e4,
        )
        s.c = torch.cuda.amp.GradScaler()
        s.g = 1.0
        s.w = w
        # `P` is rebound to the pipeline class later in the file; print directly.
        print('Optimizer initialized with AdamW and OneCycleLR')

    def step(s, l):
        """Backpropagate loss *l*, clip gradients, then update weights and LR."""
        s.c.scale(l).backward()
        s.c.unscale_(s.o)  # clip on the true (unscaled) gradient values
        params = [q for group in s.o.param_groups for q in group['params']]
        torch.nn.utils.clip_grad_norm_(params, s.g)
        s.c.step(s.o)
        s.c.update()
        # OneCycleLR raises once stepped beyond total_steps; hold the final
        # learning rate instead of aborting a longer run.
        if s.s.last_epoch < s.w - 1:
            s.s.step()
        s.o.zero_grad()
126
class T:
    """Training loop with validation, early stopping and checkpointing.

    Attributes:
      m  - the model, moved to device ``d``
      tl - training data loader
      vl - validation data loader
      d  - target device
      o  - ``Opt`` wrapper (optimizer, scheduler, grad scaler)
      mt - per-metric history of validation results
      c  - loss function (cross entropy with label smoothing 0.1)
      l  - ``Log`` instance
      b  - best validation accuracy seen so far
      _m - most recent validation metrics
    """

    def __init__(s, m, t, v, d):
        """
        Args:
            m: model to train.
            t: training data loader.
            v: validation data loader.
            d: device.
        """
        s.m = m.to(d)
        # Stored as tl/vl: the original assigned them to s.t / s.v, which
        # shadowed the t() and v() methods and made both uncallable.
        s.tl, s.vl = t, v
        s.d = d
        s.o = Opt(m.parameters())
        s.mt = defaultdict(list)
        s.c = nn.CrossEntropyLoss(label_smoothing=.1)
        s.l = Log()
        s.b = -float('inf')
        s._m = {}
        # `P` is rebound to the pipeline class later in the file; print directly.
        print('Trainer initialized with device: ' + str(d))

    def e(s):
        """Run one training epoch and return the per-batch mean of each metric."""
        s.m.train()
        totals = defaultdict(float)
        for i, (x, y) in enumerate(tqdm(s.tl, desc='Training')):
            x, y = x.to(s.d), y.to(s.d)
            with torch.cuda.amp.autocast():
                o = s.m(x)
                l = s.c(o, y)
            s.o.step(l)
            for k, val in s._c(o, y, l).items():
                totals[k] += val
            if i % 10 == 0:
                print(f'Batch {i}: Loss={l.item():.4f}')
        n = max(len(s.tl), 1)  # an empty loader must not divide by zero
        return {k: val / n for k, val in totals.items()}

    def v(s):
        """Evaluate on the validation loader and record the averaged metrics."""
        s.m.eval()
        totals = defaultdict(float)
        with torch.no_grad():
            for x, y in tqdm(s.vl, desc='Validating'):
                x, y = x.to(s.d), y.to(s.d)
                o = s.m(x)
                # Pass the loss too; without it the reported value was always 0.
                for k, val in s._c(o, y, s.c(o, y)).items():
                    totals[k] += val
        n = max(len(s.vl), 1)
        r = {k: val / n for k, val in totals.items()}
        s._u(r)
        print(f'Validation metrics: {r}')
        return r

    def _u(s, r):
        """Store validation result *r* in the history and update the best accuracy."""
        for k, val in r.items():
            s.mt[k].append(val)
        s._m = dict(r)
        s.b = max(s.b, r.get('a', s.b))

    def _c(s, o, t, l=None):
        """Compute batch metrics from logits *o* and targets *t*.

        Returns a dict with 'l' (loss value, 0 when no loss is given),
        'a' (accuracy), 'c' (mean max softmax probability) and
        'e' (mean predictive entropy).
        """
        m = {'l': l.item() if l is not None else 0}
        m['a'] = o.argmax(1).eq(t).sum().item() / t.size(0)
        with torch.no_grad():
            pb = F.softmax(o, 1)
            m['c'] = pb.max(1)[0].mean().item()
            m['e'] = -torch.sum(pb * torch.log(pb + 1e-10), 1).mean().item()
        return m

    def t(s, e, p=None, es=5):
        """Train for up to *e* epochs.

        Args:
            e: maximum number of epochs.
            p: optional checkpoint path, written whenever validation accuracy
               improves, so the file always holds the best model so far.
            es: stop after this many consecutive epochs without improvement.
        """
        best = -float('inf')
        stale = 0
        for i in range(e):
            tm = s.e()
            vm = s.v()
            tl = tm.get('l', float('nan'))
            vl = vm.get('l', float('nan'))
            va = vm.get('a', float('nan'))
            s.l.i(f'E{i+1}/{e}-TL:{tl:.4f},VL:{vl:.4f},VA:{va:.4f}')
            if vm.get('a', -float('inf')) > best:
                best = vm['a']
                stale = 0
                if p:
                    s._s(p, i, vm)
            else:
                stale += 1
            if stale >= es:
                s.l.i(f'Early stop after {i+1} epochs')
                break
            print(f'Epoch {i+1} completed')

    def _s(s, p, e, m):
        """Write a checkpoint for epoch *e* with validation metrics *m* to *p*.

        Keys: 'e' epoch, 'm' model state, 'o' optimizer state, 's' scheduler
        state, 'mt' metrics, 'c' config, 't' timestamp. Metrics used to share
        the key 'm' with the model state and silently overwrote the weights.
        """
        Path(p).parent.mkdir(parents=True, exist_ok=True)
        torch.save({
            'e': e,
            'm': s.m.state_dict(),
            'o': s.o.o.state_dict(),
            's': s.o.s.state_dict(),
            'mt': m,
            'c': Cfg.d(),
            't': time.strftime('%Y%m%d-%H%M%S'),
        }, p)
        s.l.s(f'Checkpoint saved: {p}')
181
class D:
    """Loads labelled XML documents and serves them as train/val/test datasets.

    Attributes:
      p  - root directory searched recursively for ``*.xml``
      b  - batch size
      w  - worker count for parsing and data loading
      pr - ``Res`` text processor
      l  - ``Log`` instance
      t, v, e - train / validation / test ``TensorDataset`` (None until ``s()``)
    """

    def __init__(s, p, b=Cfg.B, w=os.cpu_count()):
        s.p = Path(p)
        s.b = b
        s.w = w
        s.pr = Res()
        s.l = Log()
        s.t = s.v = s.e = None
        # `P` is rebound to the pipeline class later in the file; print directly.
        print('DataModule initialized')

    def s(s):
        """Load every file, split 80/10/10 and build the three datasets."""
        d = s._l()
        # One label map for all splits; building it per split gave the same
        # label different ids in train, validation and test.
        lm = {x: i for i, x in enumerate(sorted({r['l'] for r in d}))}
        t, v, e = s._sp(d)
        s.t, s.v, s.e = (s._c(part, lm) for part in (t, v, e))
        print(f'Datasets created: {len(s.t)}/{len(s.v)}/{len(s.e)} samples')

    def _l(s):
        """Parse all XML files under ``s.p`` in worker processes."""
        files = list(s.p.rglob('*.xml'))
        d = []
        with ProcessPoolExecutor(s.w) as ex:
            futs = [ex.submit(s._pf, f) for f in files]
            for fut in tqdm(as_completed(futs), total=len(futs)):
                if r := fut.result():
                    d.append(r)
        # Completion order varies between runs; sort so the seeded split in
        # _sp is reproducible.
        d.sort(key=lambda r: r['p'])
        print(f'Loaded {len(d)} files')
        return d

    def _pf(s, f):
        """Parse one XML file into a sample dict, or return None on failure.

        The sample has 'f' (features from ``Res.p``), 'm' (text length),
        'l' (label text, 'UNK' when no <label> element exists) and 'p' (path).
        """
        # ET is not imported at module level in this file, so every file used
        # to fail with NameError inside the except branch below.
        import xml.etree.ElementTree as ET
        try:
            r = ET.parse(f).getroot()
            tx = ' '.join(el.text for el in r.findall('.//text') if el.text)
            lab = r.find('.//label')
            return {
                'f': s.pr.p(tx),
                'm': {'l': len(tx)},
                'l': lab.text if lab is not None else 'UNK',
                'p': str(f),
            }
        except Exception as err:  # best effort: log and skip the file
            s.l.e(f'Error:{f}-{str(err)}')
            return None

    def _sp(s, d):
        """Shuffle with a fixed seed and split 80/10/10."""
        # A private RandomState yields the same permutation as seeding the
        # global generator, without resetting it for the rest of the program.
        order = np.random.RandomState(42).permutation(len(d))
        a, b = int(.8 * len(d)), int(.9 * len(d))
        return ([d[j] for j in order[:a]],
                [d[j] for j in order[a:b]],
                [d[j] for j in order[b:]])

    def _enc(s, f, n):
        """Encode one sample's features as a length-*n* tensor of token ids.

        NOTE(review): ``Res.p`` returns a dict of token lists, which cannot be
        turned into a tensor directly. Tokens are mapped to ids in
        [1, Cfg.V) with a stable hash and padded with 0. Replace this with a
        real vocabulary if one exists elsewhere in the project.
        """
        if not isinstance(f, dict):
            return torch.as_tensor(f)
        v = int(Cfg.V)
        ids = [
            int(hashlib.md5(str(tok).encode('utf-8')).hexdigest(), 16) % (v - 1) + 1
            for tok in f['t'][:n]
        ]
        return torch.tensor(ids + [0] * (n - len(ids)), dtype=torch.long)

    def _c(s, d, lm=None):
        """Build a ``TensorDataset`` of (token ids, label id) from samples *d*.

        Args:
            d: list of sample dicts as produced by ``_pf``.
            lm: label -> id map; derived from *d* when omitted.
        """
        if lm is None:
            lm = {x: i for i, x in enumerate(sorted({r['l'] for r in d}))}
        longest = max((len(r['f']['t']) for r in d if isinstance(r['f'], dict)), default=1)
        n = max(1, min(longest, Cfg.S))  # cap at the positional table length
        if not d:
            return TensorDataset(torch.zeros((0, n), dtype=torch.long),
                                 torch.zeros(0, dtype=torch.long))
        f = torch.stack([s._enc(r['f'], n) for r in d])
        y = torch.tensor([lm[r['l']] for r in d], dtype=torch.long)
        return TensorDataset(f, y)

    def dl(s, t):
        """Return a DataLoader for split *t* ('t', 'v' or 'e').

        Only the training split is shuffled and drops its last partial batch.
        """
        train = t == 't'
        # Keywords are required: after shuffle the positional parameters of
        # DataLoader are sampler and batch_sampler, not num_workers.
        return DataLoader(
            getattr(s, t), batch_size=s.b, shuffle=train,
            num_workers=s.w or 0, pin_memory=torch.cuda.is_available(),
            drop_last=train,
        )
222
+
223
class P:
    """End-to-end pipeline: load data, train, evaluate and export results.

    NOTE: this class rebinds the module-level alias ``P = print``. After the
    class statement runs, any ``P('text')`` call in this file constructs a
    pipeline instead of printing, so this class prints with ``print``.

    Attributes:
      c - configuration dict ('p' data path, 'o' output dir, optional
          'b' batch size, 'e' epochs, 'md' model depth)
      l - ``Log`` instance
      o - output directory (created on construction)
      d - ``D`` data module
      v - torch device (CUDA when available, otherwise CPU)
    """

    def __init__(s, cfg_dict):
        """Accept a config dict, or a bare data path with default output dir."""
        s.c = cfg_dict if isinstance(cfg_dict, dict) else {'p': cfg_dict, 'o': 'r_out'}
        s.l = Log()
        s.o = Path(s.c.get('o', 'r_out'))
        s.o.mkdir(parents=True, exist_ok=True)
        s.d = D(s.c['p'], s.c.get('b', Cfg.B))
        s.v = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print('Pipeline initialized with configuration')

    def r(s):
        """Run the full pipeline."""
        s.l.i('Init pipeline')
        s.d.s()
        m = s._im()
        t = T(m, s.d.dl('t'), s.d.dl('v'), s.v)
        ckpt = s.o / 'ckpt'
        ckpt.mkdir(parents=True, exist_ok=True)  # torch.save does not create it
        t.t(s.c.get('e', 50), ckpt / 'best.pth')
        s._f(m, t)
        print('Pipeline completed')

    def _im(s):
        """Create the model, sized to the largest label id across all splits."""
        s.l.i('Init model')
        # Counting unique labels in the training split alone can undercount
        # when validation or test contain a label id that train does not.
        nc = 1 + max(
            (int(ds.tensors[1].max()) for ds in (s.d.t, s.d.v, s.d.e)
             if ds.tensors[1].numel()),
            default=0,
        )
        return MA(nc, Cfg.V, Cfg.E, s.c.get('md', 12), Cfg.H).to(s.v)

    def _f(s, m, t):
        """Evaluate on the test split and export the results."""
        s.l.i('Finalizing')
        r = s._em(m, s.d.dl('e'))
        s._ex(m, t, r)
        print('Results exported')

    def _em(s, m, d):
        """Evaluate model *m* on loader *d*.

        Returns a dict with 'p' (predicted class ids), 't' (target ids) and
        'm' (per-batch metric lists from ``_cm``).
        """
        m.eval()
        p, t = [], []
        mt = defaultdict(list)
        with torch.no_grad():
            for x, y in tqdm(d, 'Evaluating'):
                x, y = x.to(s.v), y.to(s.v)
                o = m(x)
                # Plain ints, so the export in _ex can write them as JSON.
                p.extend(o.argmax(1).cpu().tolist())
                t.extend(y.cpu().tolist())
                for k, val in s._cm(o, y).items():
                    mt[k].append(val)
        return {'p': p, 't': t, 'm': mt}

    def _cm(s, o, y):
        """Batch metrics: 'a' accuracy and 'c' mean max softmax probability.

        This method was called by ``_em`` but not defined anywhere in the file.
        """
        pb = F.softmax(o, 1)
        return {
            'a': o.argmax(1).eq(y).float().mean().item(),
            'c': pb.max(1)[0].mean().item(),
        }

    def _ex(s, m, t, r):
        """Write model weights and evaluation results into the output directory.

        This method was called by ``_f`` but not defined anywhere in the file.
        It writes ``model.pt`` (state dict under key 'm') and ``metrics.json``
        (mean test metrics, predictions, targets, validation history).
        """
        torch.save({'m': m.state_dict()}, s.o / 'model.pt')
        summary = {k: float(np.mean(val)) for k, val in r['m'].items() if val}
        history = {k: list(val) for k, val in getattr(t, 'mt', {}).items()}
        with open(s.o / 'metrics.json', 'w') as f:
            json.dump({'m': summary, 'p': r['p'], 't': r['t'], 'h': history}, f)
257
class M:
    """Accumulates metric dicts: full history in ``h``, latest values in ``c``."""

    def __init__(s):
        s.h = defaultdict(list)
        s.c = defaultdict(float)
        # `P` is rebound to the pipeline class earlier in the file, so calling
        # it here would build a pipeline; print directly.
        print('Metrics initialized')

    def u(s, m):
        """Record metric dict *m*; print a summary on every 10th 'l' entry."""
        for k, v in m.items():
            s.h[k].append(v)
            s.c[k] = v
        # .get avoids inserting an empty 'l' list into the defaultdict when
        # no loss has been recorded.
        n = len(s.h.get('l', ()))
        if n and n % 10 == 0:
            print(f'Metrics updated: {dict(s.c)}')

    def g(s):
        """Return ``{'h': history, 'c': latest}`` as plain dicts."""
        return {'h': dict(s.h), 'c': dict(s.c)}
265
+
266
+ def _ce(c,a,n=15):
267
+ b=np.linspace(0,1,n+1);l,u=b[:-1],b[1:]
268
+ e=sum(abs(np.mean(c[np.logical_and(c>l,c<=h)])-np.mean(a[np.logical_and(c>l,c<=h)]))*
269
+ np.mean(np.logical_and(c>l,c<=h))for l,h in zip(l,u))
270
+ return float(e)
271
+
272
+ def _pd(p):return float(-torch.sum(p.mean(0)*torch.log(p.mean(0)+1e-10)))
273
+
274
+ def _exp(p,d,m,c):
275
+ p.mkdir(parents=True,exist_ok=True)
276
+ torch.save({'m':m.state_dict(),'c':c},p/'model.pt')
277
+ with open(p/'metrics.json','w')as f:json.dump(m,f)
278
+ P(f'Exported to {p}')
279
def main():
    """Build the default configuration and run the pipeline once."""
    cfg = {
        'p': 'data/processed',  # input directory searched for XML files
        'o': 'r_out',           # output directory
        'm': Cfg.d(),
        'b': Cfg.B,
        'md': 12,
        'e': 50,
    }
    # By the time main() runs, `P` names the pipeline class rather than print,
    # so status messages must use print; P("...") would build a pipeline.
    print("Starting pipeline with configuration...")
    pipeline = P(cfg)
    pipeline.r()
    print("Pipeline completed successfully!")
293
+
294
+ if __name__=='__main__':
295
+ print("Starting main execution...")
296
+ main()
297
+ print("Main execution completed.")
model.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import xml.etree.ElementTree as ET
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+ from typing import List, Dict, Any
7
+ from collections import defaultdict
8
+ from accelerate import Accelerator
9
+
10
class DynamicModel(nn.Module):
    """Feed-forward network assembled from named sections of layer specs.

    Each spec is a dict with 'input_size', 'output_size' and an optional
    'activation' ('relu' by default). Sections run in insertion order, and
    the layers inside a section run in list order.
    """

    # Recognised activation names, compared by equality in this order.
    _ACTIVATIONS = (('relu', nn.ReLU), ('tanh', nn.Tanh), ('sigmoid', nn.Sigmoid))

    def __init__(self, sections: Dict[str, List[Dict[str, Any]]]):
        super().__init__()
        self.sections = nn.ModuleDict()
        for name, specs in sections.items():
            stack = nn.ModuleList()
            self.sections[name] = stack
            for spec in specs:
                stack.append(self.create_layer(spec))

    def create_layer(self, layer_params: Dict[str, Any]) -> nn.Module:
        """Build one Linear layer, wrapped with its activation when recognised.

        An unrecognised activation name yields the bare ``nn.Linear``.
        """
        linear = nn.Linear(layer_params['input_size'], layer_params['output_size'])
        wanted = layer_params.get('activation', 'relu')
        act_cls = next((cls for key, cls in self._ACTIVATIONS if wanted == key), None)
        if act_cls is None:
            return linear
        return nn.Sequential(linear, act_cls())

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Pass *x* through every layer of every section in order."""
        for stack in self.sections.values():
            for module in stack:
                x = module(x)
        return x
37
+
38
def parse_xml_file(file_path: str) -> List[Dict[str, Any]]:
    """Return one fixed layer spec per ``<prov>`` element found in the file.

    The element's content is not inspected; every spec is 128 -> 256 with a
    ReLU activation. Each spec is a separate dict object.
    """
    root = ET.parse(file_path).getroot()
    return [
        {
            'input_size': 128,    # Example: fixed input size
            'output_size': 256,   # Example: fixed output size
            'activation': 'relu'  # Default activation
        }
        for _ in root.findall('.//prov')
    ]
52
+
53
def create_model_from_folder(folder_path: str) -> DynamicModel:
    """Build a ``DynamicModel`` from every ``.xml`` file under *folder_path*.

    Layers from files in the same directory form one section, named after
    that directory. Files that fail to parse are reported and skipped.

    Changes from the original behaviour:
      * directories and files are visited in sorted order, so the layer order
        (and therefore the model) is reproducible;
      * section names are taken from the normalised path and dots are replaced
        with underscores, because ``nn.ModuleDict`` rejects empty keys and
        keys containing '.';
      * each layer's ``input_size`` is set to the previous layer's
        ``output_size``. ``parse_xml_file`` emits identical 128 -> 256 specs,
        so any model with two or more layers failed in ``forward``.
    """
    sections = defaultdict(list)

    for root, dirs, files in os.walk(folder_path):
        dirs.sort()  # in-place sort steers os.walk's traversal order
        for file in sorted(files):
            if not file.endswith('.xml'):
                continue
            file_path = os.path.join(root, file)
            try:
                layers = parse_xml_file(file_path)
            except Exception as e:  # best effort: report and keep going
                print(f"Error processing {file_path}: {str(e)}")
                continue
            name = os.path.basename(os.path.normpath(root)).replace('.', '_') or 'root'
            sections[name].extend(layers)

    # Chain the layer widths in the same order DynamicModel.forward applies them.
    prev_out = None
    for layers in sections.values():
        for i, spec in enumerate(layers):
            if prev_out is not None and spec['input_size'] != prev_out:
                spec = {**spec, 'input_size': prev_out}
                layers[i] = spec
            prev_out = spec['output_size']

    return DynamicModel(sections)
68
+
69
def main():
    """Build a model from ``Xml_Data`` and train it briefly on synthetic data."""
    folder_path = 'Xml_Data'
    model = create_model_from_folder(folder_path)

    print(f"Created dynamic PyTorch model with sections: {list(model.sections.keys())}")

    # Find the first layer of the model. A missing folder, or XML files with
    # no <prov> elements, leaves the model empty; stop instead of failing
    # with an IndexError.
    first_layer = next(
        (layer for layers in model.sections.values() for layer in layers),
        None,
    )
    if first_layer is None:
        print(f"No layers could be built from '{folder_path}'; nothing to train.")
        return

    # create_layer returns a bare Linear for unrecognised activations and a
    # Sequential(Linear, activation) otherwise.
    linear = first_layer if isinstance(first_layer, nn.Linear) else first_layer[0]
    input_features = linear.in_features

    # Create sample input tensor matching the model's expected input size
    sample_input = torch.randn(1, input_features)
    output = model(sample_input)
    print(f"Sample output shape: {output.shape}")

    # Initialize accelerator for distributed training
    accelerator = Accelerator()

    # Setup optimization components
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss()
    num_epochs = 10

    # Create synthetic dataset for demonstration
    dataset = torch.utils.data.TensorDataset(
        torch.randn(100, input_features),
        torch.randint(0, 2, (100,))
    )
    train_dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=16,
        shuffle=True
    )

    # Prepare for distributed training
    model, optimizer, train_dataloader = accelerator.prepare(
        model,
        optimizer,
        train_dataloader
    )

    # Training loop
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        for inputs, labels in train_dataloader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            accelerator.backward(loss)
            optimizer.step()
            total_loss += loss.item()

        avg_loss = total_loss / len(train_dataloader)
        print(f"Epoch {epoch+1}/{num_epochs}, Average Loss: {avg_loss:.4f}")
125
+
126
+ if __name__ == "__main__":
127
+ main()
princesse.dockerfile ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Use the official Python 3.12 slim image as a base
FROM python:3.12-slim

# Prevent Python from writing .pyc files and from buffering stdout/stderr
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1

# Install system dependencies (no recommended extras, apt lists removed to keep the layer small)
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        git \
        wget \
        curl \
        unzip \
    && rm -rf /var/lib/apt/lists/*

# Create a working directory
WORKDIR /app

# Install Python dependencies. --no-cache-dir keeps pip's download cache out of the image.
RUN pip install --no-cache-dir --upgrade pip
RUN pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
RUN pip install --no-cache-dir transformers accelerate
# aii.py additionally imports numpy, scipy, networkx, nltk, tqdm and torch_geometric
RUN pip install --no-cache-dir numpy scipy networkx nltk tqdm torch_geometric
# aii.py reads the Brown corpus and uses the WordNet lemmatizer
RUN python -m nltk.downloader brown wordnet

# Copy the build context (including data.zip) into the container at /app.
# The former separate "COPY data.zip" step duplicated this.
COPY . /app

# Unzip the XML and JSON data into the /app/data directory
RUN unzip -q /app/data.zip -d /app/data

# Run the model script by default. The previous "your_script.py" was a placeholder
# that does not exist in this commit.
# NOTE(review): model.py reads the relative folder 'Xml_Data' - confirm where data.zip unpacks it.
CMD ["python", "model.py"]