# Spaces: madhavkotecha / CRF-NLP (Sleeping)
# File size: 5,011 Bytes

import numpy as np
import nltk
import sklearn_crfsuite
from sklearn_crfsuite import metrics
import gradio as gr
import re

# Fetch the NLTK resources used below: the Brown corpus and the universal
# tagset mapping requested by tagged_sents(tagset='universal').
nltk.download('brown')
nltk.download('universal_tagset')

# Hand-written example sentence as (word, tag) pairs.
sentence = list(zip(
    ('The', 'dog', 'jumps', 'over', 'the', 'car'),
    ('DET', 'NOUN', 'VERB', 'ADP', 'DET', 'NOUN'),
))

# Copy the tagged sentences into a plain list so a single entry can be
# replaced, then overwrite the sentence at index 21058 with the example above.
corpus = list(nltk.corpus.brown.tagged_sents(tagset='universal'))
corpus[21058] = sentence

def word_features(sentence, i, prev_tag):
    """Build the CRF feature dictionary for the token at position ``i``.

    Args:
        sentence: Sequence of ``(word, tag)`` pairs. Only the word (index 0)
            of each pair is read here.
        i: Index of the token to describe.
        prev_tag: Tag of the preceding token, or ``''`` when there is none.

    Returns:
        dict mapping feature names to string or boolean values. For an empty
        word the single-character prefix/suffix features are ``''`` instead
        of raising ``IndexError``.
    """
    word = sentence[i][0]
    # Slices (not indexing) so an empty word cannot raise IndexError.
    first_char = word[:1]
    starts_upper = first_char.upper() == first_char
    ends_with_s = word.endswith('s')
    last_index = len(sentence) - 1
    features = {
        'word': word,
        'is_first': i == 0,  # token opens the sentence
        'is_last': i == last_index,  # token closes the sentence
        'is_capitalized': starts_upper,
        'is_all_caps': word.upper() == word,  # unchanged by upper-casing
        'is_all_lower': word.lower() == word,  # unchanged by lower-casing
        'prefix-1': first_char,
        'prefix-2': word[:2],
        'prefix-3': word[:3],
        # Morphological prefixes. startswith() is used so the test matches the
        # literal's own length; comparing a longer slice (e.g. word[:4] to
        # 'dis') only matched when the whole word was the prefix.
        'prefix-un': word.startswith('un'),
        'prefix-re': word.startswith('re'),
        'prefix-over': word.startswith('over'),
        'prefix-dis': word.startswith('dis'),
        'prefix-mis': word.startswith('mis'),
        'prefix-pre': word.startswith('pre'),
        'prefix-non': word.startswith('non'),
        'prefix-de': word.startswith('de'),
        'prefix-in': word.startswith('in'),
        'prefix-en': word.startswith('en'),
        'suffix-1': word[-1:],
        'suffix-2': word[-2:],
        'suffix-3': word[-3:],
        # Morphological suffixes.
        'suffix-ed': word.endswith('ed'),
        'suffix-ing': word.endswith('ing'),
        'suffix-es': word.endswith('es'),
        'suffix-s': ends_with_s,
        'suffix-ly': word.endswith('ly'),
        'suffix-ment': word.endswith('ment'),
        'suffix-er': word.endswith('er'),
        # Neighbouring words; '' marks a sentence boundary.
        'prev_word': '' if i == 0 else sentence[i - 1][0],
        'next_word': '' if i == last_index else sentence[i + 1][0],
        'has_hyphen': '-' in word,
        'is_numeric': word.isdigit(),
        # True when anything after the first character changes on lower-casing.
        'capitals_inside': word[1:].lower() != word[1:],
        # Same test as 'is_capitalized'; both keys are kept so the feature set
        # stays compatible with the original.
        'is_first_capital': starts_upper,
        # Word ends with 's' and the previous tag is NOUN.
        'suffix-s_and-prev_tag_noun': ends_with_s and prev_tag == 'NOUN',
        'prev_tag': prev_tag,
    }
    return features

# Turn every corpus sentence into one list of per-token feature dicts (X) and
# one list of tags (y). The previous-tag feature is taken from the corpus
# annotation of the preceding token ('' for the first token).
X = []
y = []
for sentence in corpus:
    y_sentence = [pair[1] for pair in sentence]
    X_sentence = [
        word_features(sentence, pos, y_sentence[pos - 1] if pos else '')
        for pos in range(len(y_sentence))
    ]
    X.append(X_sentence)
    y.append(y_sentence)
 
 
# Hold out the final 20% of the sentences for evaluation; the first 80% are
# used for training. The split is positional (no shuffling).
split = int(0.8 * len(X))
X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]

# Fit the CRF tagger on the training portion.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True,
)
crf.fit(X_train, y_train)

# Tag the held-out sentences and print the flat accuracy score.
y_pred = crf.predict(X_test)
print(metrics.flat_accuracy_score(y_test, y_pred))

def predict_tags(sentence):
    """Tag a whitespace-separated sentence with the trained CRF.

    Features are built left to right. Before the features of token ``i`` are
    created, the tokens seen so far are tagged and the tag predicted for
    token ``i - 1`` becomes the ``prev_tag`` feature of token ``i``. This
    mirrors training, where ``prev_tag`` is the tag of the immediately
    preceding token.

    Args:
        sentence: Input text; tokens are obtained with ``str.split()``.

    Returns:
        list of ``(token, tag)`` tuples, one per token. Empty when the input
        has no tokens.
    """
    tokens = sentence.split()
    if not tokens:
        # Nothing to tag; skip the model call entirely.
        return []

    # word_features expects (word, tag) pairs; tags are unknown here.
    tokens2 = [(token, '') for token in tokens]
    features = []
    prev_tag = ''
    for i in range(len(tokens)):
        if i > 0:
            # `features` holds tokens 0..i-1 at this point, so the last
            # predicted tag belongs to token i-1. Predicting *before* building
            # token i's features keeps prev_tag from lagging one token behind.
            prev_tag = crf.predict([features])[0][-1]
        features.append(word_features(tokens2, i, prev_tag))

    predicted_tags = crf.predict([features])[0]
    return list(zip(tokens, predicted_tags))
    

# Example usage: tag a sample sentence when the script runs and print the
# resulting list of (token, tag) pairs.
new_sentence = "The dog walks over the car"
predicted_tags = predict_tags(new_sentence)
print(predicted_tags)

def tagging(input):
    """Tag ``input`` and render the result as a single display string.

    The text is stripped, a space is inserted between any non-whitespace
    character and a following ``. , ; : ! ?`` so that punctuation becomes its
    own token, and the result is passed to ``predict_tags``.

    Args:
        input: Raw sentence text.

    Returns:
        str made of ``word[TAG]`` entries, each followed by three spaces.
    """
    spaced = re.sub(r'(\S)([.,;:!?])', r'\1 \2', input.strip())
    pieces = []
    for word, tag in predict_tags(spaced):
        pieces.append(f"{word}[{tag}]   ")
    return ''.join(pieces)


# Text box that receives the sentence to tag.
_input_box = gr.Textbox(
    label="Input Sentence",
    placeholder="Enter your sentence here...",
)

# Text box that shows the string returned by tagging().
_output_box = gr.Textbox(
    label="Tagged Output",
    placeholder="Tagged sentence appears here...",
)

# Wire tagging() into a Gradio interface and start it.
interface = gr.Interface(
    fn=tagging,
    inputs=_input_box,
    outputs=_output_box,
    title="Conditional Random Field POS Tagger",
    description="CS626 Assignment 1B (Autumn 2024)",
    theme=gr.themes.Soft(),
)
interface.launch(inline=False)