dayannex committed
Commit 8305abf · Parent: f088a2a

English dataset

Files changed (1)
  1. app.py +58 -19
app.py CHANGED
@@ -331,16 +331,16 @@ class ModeloDataset:
         new_tokens=[]
         ig_tokens=[]
         for token in tokens:
-            print('tokensss:',tokens,caracter)
+            print('token_texto:',token,caracter)
             ind=len(new_tokens)
             if i<len(tokens):
-                if token.startswith(caracter):
+                if not token.startswith(caracter):
 
                     new_tokens.append(token)
 
                     i=i+1
                 else:
-                    new_tokens[ind-1] = (new_tokens[ind-1] + token)
+                    new_tokens[ind-1] = (new_tokens[ind-1] + token.replace(caracter,''))
                     ig_tokens.append(i)
 
                     i=i+1
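Note: the flipped condition matches the tokenizer swap later in this commit. XLM-RoBERTa's SentencePiece marks the start of a word, while DistilBERT's WordPiece marks continuations with a leading "##", so a token now opens a new word when it does not start with the marker and is merged backwards when it does. A minimal self-contained sketch of this merge logic, assuming WordPiece-style "##" markers; merge_wordpieces and the empty-list guard are illustrative additions, not part of app.py:

# Minimal sketch of the hunk's merge logic, assuming WordPiece-style "##"
# continuation markers (DistilBERT). Illustrative only; not from app.py.
def merge_wordpieces(tokens, marker="##"):
    words = []    # merged words (new_tokens in app.py)
    skipped = []  # indices folded into the previous word (ig_tokens in app.py)
    for i, token in enumerate(tokens):
        if not token.startswith(marker) or not words:
            words.append(token)                     # unmarked token starts a new word
        else:
            words[-1] += token.replace(marker, "")  # continuation: strip marker, merge back
            skipped.append(i)
    return words, skipped

print(merge_wordpieces(["John", "Do", "##e", "lives"]))
# -> (['John', 'Doe', 'lives'], [2])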
@@ -348,6 +348,32 @@ class ModeloDataset:
             new_tokens,
             ig_tokens
         )
+
+    def reordenacion_tokens_es(self,tokens,caracter):
+
+        i=0
+        new_tokens=[]
+        ig_tokens=[] # ignore these indices in the identifier array
+        for token in tokens:
+            ind=len(new_tokens)
+            if i<len(tokens):
+                if token.startswith(caracter):
+
+                    new_tokens.append(token)
+
+                    i=i+1
+                else:
+                    #if i==0: new_tokens.append(token)
+                    #else:
+                    new_tokens[ind-1] = (new_tokens[ind-1] + token.replace(caracter,''))
+                    ig_tokens.append(i)
+
+                    i=i+1
+        return (
+            new_tokens,
+            ig_tokens
+        )
+
     def reordenacion_identificadores(self,ig_tokens,predicted_tokens_classes, tamano):
         x=0
         new_identificadores=[]
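Note: the added reordenacion_tokens_es preserves the previous merge direction for the Spanish branch, where the marker flags the start of a word, so marked tokens open a new word and unmarked tokens are merged into the previous one. A hypothetical usage, assuming the class is importable from app.py, takes no constructor arguments, and is paired with an XLM-R-style "▁" word-start marker:

# Hypothetical usage of reordenacion_tokens_es; assumes a no-argument
# constructor and an XLM-R-style SentencePiece tokenizer whose "▁" marker
# flags the start of each word. Unmarked tokens are continuations, and
# replace("▁","") on a token without "▁" is a no-op.
modelo = ModeloDataset()
tokens = ["▁Juan", "▁Pér", "ez", "▁vive"]
words, skipped = modelo.reordenacion_tokens_es(tokens, "▁")
print(words)    # ["▁Juan", "▁Pérez", "▁vive"]
print(skipped)  # [2]  ("ez" was folded into "▁Pér")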
@@ -449,29 +475,42 @@ class ModeloDataset:
         else:
 
             print('idioma:',idioma)
-            self.tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-large-finetuned-conll03-english")
-            tokenized_text=[self.tokenizer.tokenize(sentence[:500]) for sentence in _sentences]
-
-            ids = [self.tokenizer.convert_tokens_to_ids(x) for x in tokenized_text]
+            self.tokenizer = AutoTokenizer.from_pretrained("dayannex/distilbert-tuned-4labels")
+            self.model = AutoModelForTokenClassification.from_pretrained("dayannex/distilbert-tuned-4labels")
+
+            inputs=[self.tokenizer(sentence[:500], return_tensors="pt") for sentence in _sentences]
+            print('inputs',inputs)
+            #ids = [self.tokenizer.convert_tokens_to_ids(x) for x in tokenized_text]
 
 
-            MAX_LEN=128
-            ids=pad_sequences(ids,maxlen=MAX_LEN,dtype="long",truncating="post", padding="post")
-            input_ids = torch.tensor(ids)
+            #MAX_LEN=128
+            #ids=pad_sequences(ids,maxlen=MAX_LEN,dtype="long",truncating="post", padding="post")
+            #input_ids = torch.tensor(ids)
 
-
-            self.model = AutoModelForTokenClassification.from_pretrained("FacebookAI/xlm-roberta-large-finetuned-conll03-english")
             with torch.no_grad():
-                logits = self.model(input_ids).logits
-            predicted_token_class_ids = logits.argmax(-1)
+                outputs = self.model(**inputs)
+            logits = outputs.logits
+            predicted_token_class_ids = torch.argmax(logits, dim=2)
+
+            #predicted_token_class_ids = predicted_token_class_ids[0].tolist()
             i=0
             _predicted_tokens_classes=[]
             for a in predicted_token_class_ids:
 
-                _predicted_tokens_classes.append([self.model.config.id2label[t.item()] for t in predicted_token_class_ids[i]])
-                i=i+1
-            labels = predicted_token_class_ids
-            loss = self.model(input_ids, labels=labels).loss
+                _predicted_tokens_classes.append( [self.model.config.id2label[label_id] for label_id in predicted_token_class_ids[i]])
+                i=i+1
+            print('_predicted_tokens_classes:',_predicted_tokens_classes[0])
+            #with torch.no_grad():
+            #    logits = self.model(input_ids).logits
+            #predicted_token_class_ids = logits.argmax(-1)
+            #i=0
+            #_predicted_tokens_classes=[]
+            #for a in predicted_token_class_ids:
+
+            #    _predicted_tokens_classes.append([self.model.config.id2label[t.item()] for t in predicted_token_class_ids[i]])
+            #    i=i+1
+            #labels = predicted_token_class_ids
+            #loss = self.model(input_ids, labels=labels).loss
 
             new_tokens=[]
             ig_tok=[]
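Note: the English branch now loads dayannex/distilbert-tuned-4labels and builds tensors straight from the tokenizer instead of the removed convert_tokens_to_ids / pad_sequences path. As committed, inputs is a Python list of per-sentence encodings, so self.model(**inputs) only accepts one encoding at a time; the sketch below is a per-sentence reading of the intended flow, not the literal app.py code, with an example sentences list standing in for the caller's input:

# Per-sentence sketch of the new inference path (not app.py itself).
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("dayannex/distilbert-tuned-4labels")
model = AutoModelForTokenClassification.from_pretrained("dayannex/distilbert-tuned-4labels")

sentences = ["John Doe lives in Madrid."]  # example input
predicted_tokens_classes = []
for sentence in sentences:
    enc = tokenizer(sentence[:500], return_tensors="pt")  # input ids + attention mask
    with torch.no_grad():
        logits = model(**enc).logits                      # shape (1, seq_len, num_labels)
    label_ids = logits.argmax(dim=2)[0]                   # best label id per token
    predicted_tokens_classes.append(
        [model.config.id2label[i.item()] for i in label_ids])
print(predicted_tokens_classes[0])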
@@ -479,7 +518,7 @@ class ModeloDataset:
             new_identificadores=[]
             for item in tokenized_text:
 
-                aux1, aux2= self.reordenacion_tokens(item,"")
+                aux1, aux2= self.reordenacion_tokens(item,"#")
                 new_tokens.append(aux1)
                 ig_tok.append(aux2)
 
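Note: the marker passed to reordenacion_tokens changes from "" to "#". Under the new not startswith test, the old "" would match every token (startswith('') is always true), so nothing would ever merge, while "#" catches WordPiece continuations ("##…"), and replace(caracter,'') then strips both hash characters. The loop still iterates over tokenized_text, whose definition was removed in the earlier hunk, so it presumably survives from elsewhere in the method. A hypothetical call, with modelo again standing in for a ModeloDataset instance as above:

# Effect of the new "#" marker on the hunk-1 merge logic.
aux1, aux2 = modelo.reordenacion_tokens(["John", "Do", "##e", "lives"], "#")
# aux1 -> ["John", "Doe", "lives"]
# aux2 -> [2]   ("##e" merged into "Do"; replace strips the hashes)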