"""Keyphrase ("topic") extraction helpers built on NLTK and pke.

Importing this module has side effects: it downloads the NLTK resources it
needs and, when ``pke`` cannot be imported, tries to install it (plus a spaCy
English model) with the running interpreter before importing it again.
"""
import logging
import string
import subprocess
import sys

import nltk
# `wordnet` is not used in this module; the import is kept because other code
# may rely on it being re-exported from here.
from nltk.corpus import stopwords, wordnet  # noqa: F401
from nltk.tokenize import sent_tokenize

logger = logging.getLogger(__name__)

# Fetch the NLTK data used below (stop words, sentence tokenizer models).
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

try:
    import pke
    logger.info("importing pke info")
except ImportError:
    logger.warning("installing pke info")
    # Install through the interpreter that is running this module so the
    # retry import below sees the freshly installed package.
    subprocess.run([sys.executable, '-m', 'pip', 'install',
                    'git+https://github.com/boudinfl/pke.git'])
    # NOTE(review): the 'en' shortcut may not be accepted by recent spaCy
    # releases -- confirm the model name against the installed version.
    subprocess.run([sys.executable, '-m', 'spacy', 'download', 'en'])
    import pke

# Tokens that must never be part of a keyphrase: punctuation, pke's and
# NLTK's English stop words, and PTB-style bracket placeholders.
stoplist = list(string.punctuation)
stoplist += pke.lang.stopwords.get('en')
stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
stoplist += stopwords.words('english')


def tokenize_sentence(text):
    """Split *text* into sentences and drop the short ones.

    Args:
        text: Raw text to split with NLTK's ``sent_tokenize``.

    Returns:
        A list of whitespace-stripped sentences; only sentences longer than
        20 characters (measured before stripping) are kept.
    """
    return [s.strip() for s in sent_tokenize(text) if len(s) > 20]


def _extract_topics(model_name, text, selection, weighting, **best):
    """Run one pke unsupervised model and return its five best keyphrases.

    Args:
        model_name: Name of the extractor class in ``pke.unsupervised``.
        text: Document passed to ``load_document``.
        selection: Keyword arguments for ``candidate_selection``.
        weighting: Keyword arguments for ``candidate_weighting``.
        **best: Extra keyword arguments for ``get_n_best`` (besides ``n=5``).

    Returns:
        Unique keyphrases in the order ``get_n_best`` returned them, or an
        empty list when any step raises.
    """
    try:
        # The class lookup stays inside the try so a missing model is handled
        # the same way as a failure while extracting.
        extractor = getattr(pke.unsupervised, model_name)()
        extractor.load_document(input=text, language='en',
                                normalization=None, stoplist=stoplist)
        extractor.candidate_selection(**selection)
        extractor.candidate_weighting(**weighting)
        ranked = extractor.get_n_best(n=5, **best)
    except Exception:
        # Best effort by design: callers get an empty list, and the traceback
        # goes to the log instead of being lost.
        logger.exception("found exception")
        return []
    # Each entry is indexed at 0 for the phrase; dict.fromkeys removes
    # duplicates while keeping the ranking order (set() would scramble it).
    return list(dict.fromkeys(item[0] for item in ranked))


def get_multipartiterank_topics(text):
    """Return up to five keyphrases of *text* using pke's MultipartiteRank.

    Candidates are restricted to the POS tags NOUN, VERB and ADJ.
    Available tags: 'ADJ' 'ADP' 'ADV' 'AUX' 'DET' 'NOUN' 'NUM' 'PART'
    'PROPN' 'PUNCT' 'VERB'.

    Returns:
        A list of unique keyphrase strings (empty if extraction fails).
    """
    return _extract_topics(
        'MultipartiteRank',
        text,
        selection={'pos': {'NOUN', 'VERB', 'ADJ'}},
        weighting={'threshold': 0.7, 'method': 'average', 'alpha': 1.1},
    )


def get_topicrank_topics(text):
    """Return up to five keyphrases of *text* using pke's TopicRank.

    Candidates are restricted to the POS tags NOUN and ADJ.
    Available tags: 'ADJ' 'ADP' 'ADV' 'AUX' 'DET' 'NOUN' 'NUM' 'PART'
    'PROPN' 'PUNCT' 'VERB'.

    Returns:
        A list of unique keyphrase strings (empty if extraction fails).
    """
    return _extract_topics(
        'TopicRank',
        text,
        selection={'pos': {'NOUN', 'ADJ'}},
        weighting={'threshold': 0.7, 'method': 'average'},
    )


def get_yake_topics(text):
    """Return up to five keyphrases of *text* using pke's YAKE.

    Author's note: statistical model -- very poor performance.

    Candidate selection is called with ``n=3``, weighting with ``window=2``,
    and ``get_n_best`` with ``threshold=0.9``.

    Returns:
        A list of unique keyphrase strings (empty if extraction fails).
    """
    return _extract_topics(
        'YAKE',
        text,
        selection={'n': 3},
        weighting={'window': 2},
        threshold=0.9,
    )