dataset ingles
app.py CHANGED
@@ -331,16 +331,16 @@ class ModeloDataset:
        new_tokens=[]
        ig_tokens=[]
        for token in tokens:
-            print('
+            print('token_texto:',token,caracter)
            ind=len(new_tokens)
            if i<len(tokens):
-                if
+                if not token.startswith(caracter):

                    new_tokens.append(token)

                    i=i+1
                else:
-                    new_tokens[ind-1] = (new_tokens[ind-1] + token)
+                    new_tokens[ind-1] = (new_tokens[ind-1] + token.replace(caracter,''))
                    ig_tokens.append(i)

                    i=i+1
@@ -348,6 +348,32 @@ class ModeloDataset:
            new_tokens,
            ig_tokens
        )
+
+    def reordenacion_tokens_es(self,tokens,caracter):
+
+        i=0
+        new_tokens=[]
+        ig_tokens=[] #ignorar estos indices del array de indentificadores
+        for token in tokens:
+            ind=len(new_tokens)
+            if i<len(tokens):
+                if token.startswith(caracter):
+
+                    new_tokens.append(token)
+
+                    i=i+1
+                else:
+                    #if i==0: new_tokens.append(token)
+                    #else:
+                    new_tokens[ind-1] = (new_tokens[ind-1] + token.replace(caracter,''))
+                    ig_tokens.append(i)
+
+                    i=i+1
+        return (
+            new_tokens,
+            ig_tokens
+        )
+
    def reordenacion_identificadores(self,ig_tokens,predicted_tokens_classes, tamano):
        x=0
        new_identificadores=[]
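Editor's note: the modified reordenacion_tokens and the new reordenacion_tokens_es differ only in which side of the marker character marks a word boundary. reordenacion_tokens now glues pieces that start with the marker onto the previous token (the call below passes "#", which matches WordPiece-style "##" continuations from the DistilBERT checkpoint), while reordenacion_tokens_es treats a leading marker as the start of a new word and glues unmarked pieces onto the previous one, presumably for a SentencePiece-style tokenizer. A minimal standalone sketch of the first behaviour, with an illustrative function name and sample tokens that are not part of the Space's code:

def merge_wordpiece(tokens, marker="#"):
    # Rebuild words from sub-word pieces and collect the indices of the
    # merged pieces, so their predicted labels can be skipped later,
    # mirroring what reordenacion_tokens does with ig_tokens.
    merged = []
    ignored = []
    for i, token in enumerate(tokens):
        if not token.startswith(marker):
            merged.append(token)                     # piece starts a new word
        else:
            merged[-1] += token.replace(marker, "")  # continuation: append to previous word
            ignored.append(i)
    return merged, ignored

print(merge_wordpiece(["New", "York", "Cit", "##y"]))
# (['New', 'York', 'City'], [3])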
@@ -449,29 +475,42 @@ class ModeloDataset:
        else:

            print('idioma:',idioma)
-            self.tokenizer = AutoTokenizer.from_pretrained("
-
-
-
+            self.tokenizer = AutoTokenizer.from_pretrained("dayannex/distilbert-tuned-4labels")
+            self.model = AutoModelForTokenClassification.from_pretrained("dayannex/distilbert-tuned-4labels")
+
+            inputs=[self.tokenizer(sentence[:500], return_tensors="pt") for sentence in _sentences]
+            print('inputs',inputs)
+            #ids = [self.tokenizer.convert_tokens_to_ids(x) for x in tokenized_text]


-            MAX_LEN=128
-            ids=pad_sequences(ids,maxlen=MAX_LEN,dtype="long",truncating="post", padding="post")
-            input_ids = torch.tensor(ids)
+            #MAX_LEN=128
+            #ids=pad_sequences(ids,maxlen=MAX_LEN,dtype="long",truncating="post", padding="post")
+            #input_ids = torch.tensor(ids)

-
-            self.model = AutoModelForTokenClassification.from_pretrained("FacebookAI/xlm-roberta-large-finetuned-conll03-english")
            with torch.no_grad():
-
-
+                outputs = self.model(**inputs)
+                logits = outputs.logits
+                predicted_token_class_ids = torch.argmax(logits, dim=2)
+
+            #predicted_token_class_ids = predicted_token_class_ids[0].tolist()
            i=0
            _predicted_tokens_classes=[]
            for a in predicted_token_class_ids:

-                _predicted_tokens_classes.append([self.model.config.id2label[
-                i=i+1
-
-
+                _predicted_tokens_classes.append( [self.model.config.id2label[label_id] for label_id in predicted_token_class_ids[i]])
+                i=i+1
+                print('_predicted_tokens_classes:',_predicted_tokens_classes[0])
+                #with torch.no_grad():
+                #    logits = self.model(input_ids).logits
+                #predicted_token_class_ids = logits.argmax(-1)
+                #i=0
+                #_predicted_tokens_classes=[]
+                #for a in predicted_token_class_ids:
+
+                #    _predicted_tokens_classes.append([self.model.config.id2label[t.item()] for t in predicted_token_class_ids[i]])
+                #    i=i+1
+                #labels = predicted_token_class_ids
+                #loss = self.model(input_ids, labels=labels).loss

            new_tokens=[]
            ig_tok=[]
@@ -479,7 +518,7 @@ class ModeloDataset:
            new_identificadores=[]
            for item in tokenized_text:

-                aux1, aux2= self.reordenacion_tokens(item,"
+                aux1, aux2= self.reordenacion_tokens(item,"#")
                new_tokens.append(aux1)
                ig_tok.append(aux2)

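Editor's note: a minimal sketch of the inference path the English branch now follows, tokenizing each sentence individually (truncated to 500 characters, as in the diff), running the token-classification model under torch.no_grad(), taking the argmax over the label dimension, and mapping ids to names through model.config.id2label. The sample sentence is illustrative, the sketch assumes the dayannex/distilbert-tuned-4labels checkpoint is available on the Hub, and it calls the model once per encoding because Python's ** unpacking requires a single mapping of tensors rather than the list built in the diff.

import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

checkpoint = "dayannex/distilbert-tuned-4labels"   # checkpoint named in the diff
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForTokenClassification.from_pretrained(checkpoint)

sentences = ["John Smith lives in New York."]       # illustrative input
for sentence in sentences:
    inputs = tokenizer(sentence[:500], return_tensors="pt")   # truncate very long sentences
    with torch.no_grad():
        logits = model(**inputs).logits                        # shape (1, seq_len, num_labels)
    predicted_ids = logits.argmax(dim=2)[0]                    # best label id per sub-word token
    labels = [model.config.id2label[i.item()] for i in predicted_ids]
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    print(list(zip(tokens, labels)))

The predicted labels still refer to sub-word pieces, which is why the app then merges tokens with reordenacion_tokens and uses the ig_tok indices in reordenacion_identificadores before aligning labels to whole words.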