Spaces:
Build error
Build error
File size: 1,293 Bytes
ae5152f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 |
# -*- coding: utf-8 -*-
"""
Created on Fri Jun 12 15:26:44 2020
@author: luol2
"""
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.stem.porter import PorterStemmer
# Shared module-level instances: created once at import time and reused by
# ssplit_token_pos_lemma for every token it processes.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
import io
def get_wordnet_pos(treebank_tag):
    """Translate a treebank POS tag into the matching WordNet POS constant.

    Tags starting with 'J' map to ``wordnet.ADJ``, 'V' to ``wordnet.VERB``,
    'N' to ``wordnet.NOUN`` and 'R' to ``wordnet.ADV``. The exact tag 'IN'
    is also mapped to ``wordnet.ADV``. Every other tag falls back to
    ``wordnet.NOUN``.
    """
    # 'IN' is matched as a whole tag, not as a prefix; none of the prefix
    # rules below start with 'I', so checking it first changes nothing.
    if treebank_tag == 'IN':
        return wordnet.ADV

    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_table:
        if treebank_tag.startswith(prefix):
            return wordnet_pos

    # Unrecognized tags are treated as nouns.
    return wordnet.NOUN
def ssplit_token_pos_lemma(in_text):
    """Sentence-split, tokenize, POS-tag, lemmatize and stem ``in_text``.

    Returns a single string with one line per token, formatted as
    ``token<TAB>lemma<TAB>stem<TAB>pos_tag`` and terminated by a newline.
    An empty line is emitted after the tokens of each sentence. The lemma
    and stem are computed from the lower-cased token; the token itself is
    written with its original casing.
    """
    # Surround hyphens and slashes with spaces before tokenizing.
    padded_text = in_text.strip().replace('-', ' - ').replace('/', ' / ')
    tokenized_sentences = [
        nltk.word_tokenize(sentence)
        for sentence in nltk.sent_tokenize(padded_text)
    ]

    rows = []
    for words in tokenized_sentences:
        for word, tag in nltk.pos_tag(words):
            lowered = word.lower()
            lemma = lemmatizer.lemmatize(lowered, get_wordnet_pos(tag))
            stem = stemmer.stem(lowered)
            rows.append('\t'.join((word, lemma, stem, tag)) + '\n')
        # Blank line marks the end of the sentence.
        rows.append('\n')

    return ''.join(rows)